From 4ce7a39ce1b55130f4d33e694ec38238f28a2c2b Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 14 Jan 2024 14:53:33 +0100 Subject: [PATCH] Update all references --- .../python/user-guide/basics/expressions.py | 2 +- .../user-guide/expressions/aggregation.py | 4 +- .../expressions/user-defined-functions.py | 2 +- docs/src/python/user-guide/io/multiple.py | 5 ++- .../transformations/time-series/rolling.py | 9 ++--- py-polars/polars/dataframe/frame.py | 16 +++++--- py-polars/polars/dataframe/group_by.py | 30 ++++++++++++++- py-polars/polars/expr/meta.py | 8 ++-- py-polars/polars/functions/range/int_range.py | 4 +- py-polars/polars/lazyframe/frame.py | 4 +- py-polars/polars/lazyframe/group_by.py | 38 ++++++++++++++++--- py-polars/polars/type_aliases.py | 2 +- py-polars/tests/unit/dataframe/test_df.py | 6 +-- .../tests/unit/datatypes/test_categorical.py | 2 +- .../tests/unit/datatypes/test_temporal.py | 36 ++++++------------ py-polars/tests/unit/expr/test_exprs.py | 12 +++--- py-polars/tests/unit/interop/test_interop.py | 12 +++--- py-polars/tests/unit/io/test_lazy_csv.py | 4 +- py-polars/tests/unit/io/test_pickle.py | 2 +- py-polars/tests/unit/namespaces/test_meta.py | 6 +-- .../unit/operations/rolling/test_rolling.py | 4 +- .../tests/unit/operations/test_filter.py | 6 +-- .../tests/unit/operations/test_group_by.py | 8 ++-- .../unit/operations/test_group_by_dynamic.py | 12 +++--- py-polars/tests/unit/operations/test_pivot.py | 10 ++--- .../tests/unit/operations/test_random.py | 2 +- .../tests/unit/operations/test_rolling.py | 7 ++-- .../tests/unit/operations/test_window.py | 6 +-- .../tests/unit/streaming/test_streaming.py | 4 +- .../unit/streaming/test_streaming_group_by.py | 22 +++++------ py-polars/tests/unit/test_cse.py | 10 ++--- py-polars/tests/unit/test_errors.py | 2 +- py-polars/tests/unit/test_lazy.py | 6 +-- py-polars/tests/unit/test_predicates.py | 10 ++--- py-polars/tests/unit/test_projections.py | 18 +++++---- py-polars/tests/unit/test_queries.py 
| 8 ++-- py-polars/tests/unit/test_schema.py | 7 ++-- 37 files changed, 195 insertions(+), 151 deletions(-) diff --git a/docs/src/python/user-guide/basics/expressions.py b/docs/src/python/user-guide/basics/expressions.py index 451cf83441f0..041b023f27c4 100644 --- a/docs/src/python/user-guide/basics/expressions.py +++ b/docs/src/python/user-guide/basics/expressions.py @@ -63,7 +63,7 @@ # --8<-- [end:dataframe2] # --8<-- [start:group_by] -df2.group_by("y", maintain_order=True).count() +df2.group_by("y", maintain_order=True).len() # --8<-- [end:group_by] # --8<-- [start:group_by2] diff --git a/docs/src/python/user-guide/expressions/aggregation.py b/docs/src/python/user-guide/expressions/aggregation.py index cfcd9970573b..e25917b2de38 100644 --- a/docs/src/python/user-guide/expressions/aggregation.py +++ b/docs/src/python/user-guide/expressions/aggregation.py @@ -24,11 +24,11 @@ dataset.lazy() .group_by("first_name") .agg( - pl.count(), + pl.len(), pl.col("gender"), pl.first("last_name"), ) - .sort("count", descending=True) + .sort("len", descending=True) .limit(5) ) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 16f0da8dca76..e0658b2d36a4 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -43,7 +43,7 @@ def add_counter(val: int) -> int: out = df.select( pl.col("values").map_elements(add_counter).alias("solution_map_elements"), - (pl.col("values") + pl.int_range(1, pl.count() + 1)).alias("solution_expr"), + (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"), ) print(out) # --8<-- [end:counter] diff --git a/docs/src/python/user-guide/io/multiple.py b/docs/src/python/user-guide/io/multiple.py index f7500b6b6684..a718c5cd1588 100644 --- a/docs/src/python/user-guide/io/multiple.py +++ b/docs/src/python/user-guide/io/multiple.py @@ -28,12 +28,13 @@ 
# --8<-- [end:graph] # --8<-- [start:glob] -import polars as pl import glob +import polars as pl + queries = [] for file in glob.glob("docs/data/my_many_files_*.csv"): - q = pl.scan_csv(file).group_by("bar").agg([pl.count(), pl.sum("foo")]) + q = pl.scan_csv(file).group_by("bar").agg(pl.len(), pl.sum("foo")) queries.append(q) dataframes = pl.collect_all(queries) diff --git a/docs/src/python/user-guide/transformations/time-series/rolling.py b/docs/src/python/user-guide/transformations/time-series/rolling.py index 0a65cbc195fd..f34f56ee6d36 100644 --- a/docs/src/python/user-guide/transformations/time-series/rolling.py +++ b/docs/src/python/user-guide/transformations/time-series/rolling.py @@ -1,7 +1,8 @@ # --8<-- [start:setup] -import polars as pl from datetime import date, datetime +import polars as pl + # --8<-- [end:setup] # --8<-- [start:df] @@ -60,10 +61,6 @@ closed="both", by="groups", include_boundaries=True, -).agg( - [ - pl.count(), - ] -) +).agg(pl.len()) print(out) # --8<-- [end:group_by_dyn2] diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 4c5a5eb5ca82..9a49ce2d50d1 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5272,10 +5272,10 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self: └──────┴─────┴─────┘ An index column can also be created using the expressions :func:`int_range` - and :func:`count`. + and :func:`len`. >>> df.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), ... pl.all(), ... ) shape: (3, 3) @@ -7260,9 +7260,8 @@ def pivot( - None: no aggregation takes place, will raise error if multiple values are in group. - A predefined aggregate function string, one of - {'first', 'sum', 'max', 'min', 'mean', 'median', 'last', 'count'} + {'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'} - An expression to do the aggregation. 
- maintain_order Sort the grouped keys so that the output order is predictable. sort_columns @@ -7392,8 +7391,15 @@ def pivot( aggregate_expr = F.element().median()._pyexpr elif aggregate_function == "last": aggregate_expr = F.element().last()._pyexpr + elif aggregate_function == "len": + aggregate_expr = F.len()._pyexpr elif aggregate_function == "count": - aggregate_expr = F.count()._pyexpr + issue_deprecation_warning( + "`aggregate_function='count'` input for `pivot` is deprecated." + " Please use `aggregate_function='len'`.", + version="0.20.5", + ) + aggregate_expr = F.len()._pyexpr else: msg = f"invalid input for `aggregate_function` argument: {aggregate_function!r}" raise ValueError(msg) diff --git a/py-polars/polars/dataframe/group_by.py b/py-polars/polars/dataframe/group_by.py index 32668730cace..fa11d5a65946 100644 --- a/py-polars/polars/dataframe/group_by.py +++ b/py-polars/polars/dataframe/group_by.py @@ -305,7 +305,7 @@ def map_groups(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: It is better to implement this with an expression: >>> df.filter( - ... pl.int_range(0, pl.count()).shuffle().over("color") < 2 + ... pl.int_range(pl.len()).shuffle().over("color") < 2 ... ) # doctest: +IGNORE_RESULT """ by: list[str] @@ -452,6 +452,32 @@ def all(self) -> DataFrame: """ return self.agg(F.all()) + def len(self) -> DataFrame: + """ + Return the number of rows in each group. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["apple", "apple", "orange"], + ... "b": [1, None, 2], + ... } + ... ) + >>> df.group_by("a").len() # doctest: +SKIP + shape: (2, 2) + ┌────────┬─────┐ + │ a ┆ len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞════════╪═════╡ + │ apple ┆ 2 │ + │ orange ┆ 1 │ + └────────┴─────┘ + """ + return self.agg(F.len()) + + @deprecate_renamed_function("len", version="0.20.5") def count(self) -> DataFrame: """ Return the number of rows in each group. 
@@ -477,7 +503,7 @@ def count(self) -> DataFrame: │ orange ┆ 1 │ └────────┴───────┘ """ - return self.agg(F.count()) + return self.agg(F.len().alias("count")) def first(self) -> DataFrame: """ diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 5ae3e03a0d80..4c5e0eb2eb0c 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -130,11 +130,11 @@ def output_name(self, *, raise_if_undetermined: bool = True) -> str | None: >>> e_sum_over = pl.sum("foo").over("groups") >>> e_sum_over.meta.output_name() 'foo' - >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice = pl.sum("foo").slice(pl.len() - 10, pl.col("bar")) >>> e_sum_slice.meta.output_name() 'foo' - >>> pl.count().meta.output_name() - 'count' + >>> pl.len().meta.output_name() + 'len' """ try: return self._pyexpr.meta_output_name() @@ -180,7 +180,7 @@ def root_names(self) -> list[str]: >>> e_sum_over = pl.sum("foo").over("groups") >>> e_sum_over.meta.root_names() ['foo', 'groups'] - >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice = pl.sum("foo").slice(pl.len() - 10, pl.col("bar")) >>> e_sum_slice.meta.root_names() ['foo', 'bar'] """ diff --git a/py-polars/polars/functions/range/int_range.py b/py-polars/polars/functions/range/int_range.py index 91518fd5f816..96633efe97c2 100644 --- a/py-polars/polars/functions/range/int_range.py +++ b/py-polars/polars/functions/range/int_range.py @@ -198,11 +198,11 @@ def int_range( 2 ] - Generate an index column using `int_range` in conjunction with :func:`count`. + Generate an index column by using `int_range` in conjunction with :func:`len`. >>> df = pl.DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) >>> df.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), ... pl.all(), ... 
) shape: (3, 3) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index b7a93a929809..fccfa642817e 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4616,10 +4616,10 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self: └──────┴─────┴─────┘ An index column can also be created using the expressions :func:`int_range` - and :func:`count`. + and :func:`len`. >>> lf.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), ... pl.all(), ... ).collect() shape: (3, 3) diff --git a/py-polars/polars/lazyframe/group_by.py b/py-polars/polars/lazyframe/group_by.py index 21ae284fef43..b8e3aa588c7c 100644 --- a/py-polars/polars/lazyframe/group_by.py +++ b/py-polars/polars/lazyframe/group_by.py @@ -208,11 +208,9 @@ def map_groups( It is better to implement this with an expression: - >>> ( - ... df.lazy() - ... .filter(pl.int_range(0, pl.count()).shuffle().over("color") < 2) - ... .collect() - ... ) # doctest: +IGNORE_RESULT + >>> df.lazy().filter( + ... pl.int_range(pl.len()).shuffle().over("color") < 2 + ... ).collect() # doctest: +IGNORE_RESULT """ return wrap_ldf(self.lgb.map_groups(function, schema)) @@ -335,6 +333,34 @@ def all(self) -> LazyFrame: """ return self.agg(F.all()) + def len(self) -> LazyFrame: + """ + Return the number of rows in each group. + + Rows containing null values count towards the total. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["apple", "apple", "orange"], + ... "b": [1, None, 2], + ... } + ... 
) + >>> lf.group_by("a").len().collect() # doctest: +SKIP + shape: (2, 2) + ┌────────┬─────┐ + │ a ┆ len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞════════╪═════╡ + │ apple ┆ 2 │ + │ orange ┆ 1 │ + └────────┴─────┘ + """ + return self.agg(F.len()) + + @deprecate_renamed_function("len", version="0.20.5") def count(self) -> LazyFrame: """ Return the number of rows in each group. @@ -360,7 +386,7 @@ def count(self) -> LazyFrame: │ orange ┆ 1 │ └────────┴───────┘ """ - return self.agg(F.count()) + return self.agg(F.len().alias("count")) def first(self) -> LazyFrame: """ diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 7570718192de..4e00664d42c7 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -100,7 +100,7 @@ "lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd" ] PivotAgg: TypeAlias = Literal[ - "first", "sum", "max", "min", "mean", "median", "last", "count" + "min", "max", "first", "last", "sum", "mean", "median", "len" ] RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"] SizeUnit: TypeAlias = Literal[ diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 039ad3013a51..2b6d965734da 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1774,9 +1774,9 @@ def __repr__(self) -> str: def test_group_by_order_dispatch() -> None: df = pl.DataFrame({"x": list("bab"), "y": range(3)}) - result = df.group_by("x", maintain_order=True).count() + result = df.group_by("x", maintain_order=True).len() expected = pl.DataFrame( - {"x": ["b", "a"], "count": [2, 1]}, schema_overrides={"count": pl.UInt32} + {"x": ["b", "a"], "len": [2, 1]}, schema_overrides={"len": pl.UInt32} ) assert_frame_equal(result, expected) @@ -2409,7 +2409,7 @@ def test_group_by_slice_expression_args() -> None: out = ( df.group_by("groups", maintain_order=True) -
.agg([pl.col("vals").slice(pl.count() * 0.1, (pl.count() // 5))]) + .agg([pl.col("vals").slice(pl.len() * 0.1, (pl.len() // 5))]) .explode("vals") ) diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index f61d708b9d58..07f7a2026305 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -124,7 +124,7 @@ def test_unset_sorted_on_append() -> None: ] ).sort("key") df = pl.concat([df1, df2], rechunk=False) - assert df.group_by("key").count()["count"].to_list() == [4, 4] + assert df.group_by("key").len()["len"].to_list() == [4, 4] @pytest.mark.parametrize( diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 6bb8e754c6ce..38679e2be9d9 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1310,13 +1310,13 @@ def test_rolling_by_() -> None: out = ( df.sort("datetime") .rolling(index_column="datetime", by="group", period=timedelta(days=3)) - .agg([pl.count().alias("count")]) + .agg([pl.len().alias("count")]) ) expected = ( df.sort(["group", "datetime"]) .rolling(index_column="datetime", by="group", period="3d") - .agg([pl.count().alias("count")]) + .agg([pl.len().alias("count")]) ) assert_frame_equal(out.sort(["group", "datetime"]), expected) assert out.to_dict(as_series=False) == { @@ -2573,30 +2573,18 @@ def test_datetime_cum_agg_schema() -> None: def test_rolling_group_by_empty_groups_by_take_6330() -> None: - df = ( - pl.DataFrame({"Event": ["Rain", "Sun"]}) - .join( - pl.DataFrame( - { - "Date": [1, 2, 3, 4], - } - ), - how="cross", - ) - .set_sorted("Date") - ) - assert ( - df.rolling( - index_column="Date", - period="2i", - offset="-2i", - by="Event", - closed="left", - ).agg([pl.count()]) - ).to_dict(as_series=False) == { + df1 = pl.DataFrame({"Event": ["Rain", "Sun"]}) + df2 = pl.DataFrame({"Date": [1, 2, 
3, 4]}) + df = df1.join(df2, how="cross").set_sorted("Date") + + result = df.rolling( + index_column="Date", period="2i", offset="-2i", by="Event", closed="left" + ).agg(pl.len()) + + assert result.to_dict(as_series=False) == { "Event": ["Rain", "Rain", "Rain", "Rain", "Sun", "Sun", "Sun", "Sun"], "Date": [1, 2, 3, 4, 1, 2, 3, 4], - "count": [0, 1, 2, 2, 0, 1, 2, 2], + "len": [0, 1, 2, 2, 0, 1, 2, 2], } diff --git a/py-polars/tests/unit/expr/test_exprs.py b/py-polars/tests/unit/expr/test_exprs.py index 4fa9008e5b20..25c9ce9df4c2 100644 --- a/py-polars/tests/unit/expr/test_exprs.py +++ b/py-polars/tests/unit/expr/test_exprs.py @@ -100,16 +100,16 @@ def test_filter_where() -> None: ] -def test_count_expr() -> None: +def test_len_expr() -> None: df = pl.DataFrame({"a": [1, 2, 3, 3, 3], "b": ["a", "a", "b", "a", "a"]}) - out = df.select(pl.count()) + out = df.select(pl.len()) assert out.shape == (1, 1) assert cast(int, out.item()) == 5 - out = df.group_by("b", maintain_order=True).agg(pl.count()) + out = df.group_by("b", maintain_order=True).agg(pl.len()) assert out["b"].to_list() == ["a", "b"] - assert out["count"].to_list() == [4, 1] + assert out["len"].to_list() == [4, 1] def test_map_alias() -> None: @@ -678,7 +678,7 @@ def test_head() -> None: assert df.select(pl.col("a").head(10)).to_dict(as_series=False) == { "a": [1, 2, 3, 4, 5] } - assert df.select(pl.col("a").head(pl.count() / 2)).to_dict(as_series=False) == { + assert df.select(pl.col("a").head(pl.len() / 2)).to_dict(as_series=False) == { "a": [1, 2] } @@ -690,7 +690,7 @@ def test_tail() -> None: assert df.select(pl.col("a").tail(10)).to_dict(as_series=False) == { "a": [1, 2, 3, 4, 5] } - assert df.select(pl.col("a").tail(pl.count() / 2)).to_dict(as_series=False) == { + assert df.select(pl.col("a").tail(pl.len() / 2)).to_dict(as_series=False) == { "a": [4, 5] } diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index ee886455a67b..55154c540d65 100644 --- 
a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -1038,11 +1038,13 @@ def test_to_init_repr() -> None: def test_untrusted_categorical_input() -> None: - df = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) - assert pl.from_pandas(df).group_by("x").count().to_dict(as_series=False) == { - "x": ["x"], - "count": [1], - } + df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) + df = pl.from_pandas(df_pd) + result = df.group_by("x").len() + expected = pl.DataFrame( + {"x": ["x"], "len": [1]}, schema={"x": pl.Categorical, "len": pl.UInt32} + ) + assert_frame_equal(result, expected, categorical_as_str=True) def test_sliced_struct_from_arrow() -> None: diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index f80e5f44d0da..22e57462ae49 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -252,10 +252,10 @@ def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: Path) -> foods_file_path, dtypes={"calories": pl.String, "sugars_g": pl.Int8}, ) - .select(pl.count()) + .select(pl.len()) .collect() ) - expected = pl.DataFrame({"count": 27}, schema={"count": pl.UInt32}) + expected = pl.DataFrame({"len": 27}, schema={"len": pl.UInt32}) assert_frame_equal(df, expected) diff --git a/py-polars/tests/unit/io/test_pickle.py b/py-polars/tests/unit/io/test_pickle.py index 5e307228a67a..57cd6d954b9d 100644 --- a/py-polars/tests/unit/io/test_pickle.py +++ b/py-polars/tests/unit/io/test_pickle.py @@ -19,7 +19,7 @@ def test_pickle() -> None: def test_pickle_expr() -> None: - for e in [pl.all(), pl.count()]: + for e in [pl.all(), pl.len()]: f = io.BytesIO() pickle.dump(e, f) diff --git a/py-polars/tests/unit/namespaces/test_meta.py b/py-polars/tests/unit/namespaces/test_meta.py index 93916daa3fa3..fe554c694491 100644 --- a/py-polars/tests/unit/namespaces/test_meta.py +++ 
b/py-polars/tests/unit/namespaces/test_meta.py @@ -34,12 +34,12 @@ def test_root_and_output_names() -> None: assert e.meta.output_name() == "foo" assert e.meta.root_names() == ["foo", "groups"] - e = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + e = pl.sum("foo").slice(pl.len() - 10, pl.col("bar")) assert e.meta.output_name() == "foo" assert e.meta.root_names() == ["foo", "bar"] - e = pl.count() - assert e.meta.output_name() == "count" + e = pl.len() + assert e.meta.output_name() == "len" with pytest.raises( pl.ComputeError, diff --git a/py-polars/tests/unit/operations/rolling/test_rolling.py b/py-polars/tests/unit/operations/rolling/test_rolling.py index bad3d307c94e..e30cc160f505 100644 --- a/py-polars/tests/unit/operations/rolling/test_rolling.py +++ b/py-polars/tests/unit/operations/rolling/test_rolling.py @@ -230,7 +230,7 @@ def test_rolling_extrema() -> None: ) ).with_columns( [ - pl.when(pl.int_range(0, pl.count(), eager=False) < 2) + pl.when(pl.int_range(0, pl.len(), eager=False) < 2) .then(None) .otherwise(pl.all()) .name.suffix("_nulls") @@ -815,7 +815,7 @@ def test_index_expr_with_literal() -> None: def test_index_expr_output_name_12244() -> None: df = pl.DataFrame({"A": [1, 2, 3]}) - out = df.rolling(pl.int_range(0, pl.count()), period="2i").agg("A") + out = df.rolling(pl.int_range(0, pl.len()), period="2i").agg("A") assert out.to_dict(as_series=False) == { "literal": [0, 1, 2], "A": [[1], [1, 2], [2, 3]], diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py index 3ade166f7422..533eadd37339 100644 --- a/py-polars/tests/unit/operations/test_filter.py +++ b/py-polars/tests/unit/operations/test_filter.py @@ -131,7 +131,7 @@ def test_predicate_order_explode_5950() -> None: assert ( df.lazy() .explode("i") - .filter(pl.count().over(["i"]) == 2) + .filter(pl.len().over(["i"]) == 2) .filter(pl.col("n").is_not_null()) ).collect().to_dict(as_series=False) == {"i": [1], "n": [0]} @@ -184,8 +184,8 @@ 
def test_clear_window_cache_after_filter_10499() -> None: } ) - assert df.lazy().filter((pl.col("a").null_count() < pl.count()).over("b")).filter( - ((pl.col("a") == 0).sum() < pl.count()).over("b") + assert df.lazy().filter((pl.col("a").null_count() < pl.len()).over("b")).filter( + ((pl.col("a") == 0).sum() < pl.len()).over("b") ).collect().to_dict(as_series=False) == { "a": [3, None, 5, 0, 9, 10], "b": [2, 2, 3, 3, 5, 5], diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index cebbbb106ab3..0b334568b073 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -250,7 +250,7 @@ def df() -> pl.DataFrame: ("method", "expected"), [ ("all", [("a", [1, 2], [None, 1]), ("b", [3, 4, 5], [None, 1, None])]), - ("count", [("a", 2), ("b", 3)]), + ("len", [("a", 2), ("b", 3)]), ("first", [("a", 1, None), ("b", 3, None)]), ("last", [("a", 2, 1), ("b", 5, None)]), ("max", [("a", 2, 1), ("b", 5, 1)]), @@ -763,7 +763,7 @@ def test_perfect_hash_table_null_values() -> None: def test_group_by_partitioned_ending_cast(monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_FORCE_PARTITION", "1") df = pl.DataFrame({"a": [1] * 5, "b": [1] * 5}) - out = df.group_by(["a", "b"]).agg(pl.count().cast(pl.Int64).alias("num")) + out = df.group_by(["a", "b"]).agg(pl.len().cast(pl.Int64).alias("num")) expected = pl.DataFrame({"a": [1], "b": [1], "num": [5]}) assert_frame_equal(out, expected) @@ -890,8 +890,8 @@ def test_group_by_with_expr_as_key() -> None: def test_lazy_group_by_reuse_11767() -> None: lgb = pl.select(x=1).lazy().group_by("x") - a = lgb.count() - b = lgb.count() + a = lgb.len() + b = lgb.len() assert_frame_equal(a, b) diff --git a/py-polars/tests/unit/operations/test_group_by_dynamic.py b/py-polars/tests/unit/operations/test_group_by_dynamic.py index 1f6799dd8005..9404b22ea52a 100644 --- a/py-polars/tests/unit/operations/test_group_by_dynamic.py +++ 
b/py-polars/tests/unit/operations/test_group_by_dynamic.py @@ -113,7 +113,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: include_boundaries=True, label="datapoint", start_by="datapoint", - ).agg(pl.count()).to_dict(as_series=False) == { + ).agg(pl.len()).to_dict(as_series=False) == { "_lower_boundary": [ datetime(2022, 12, 16, 0, 0, tzinfo=tzinfo), datetime(2022, 12, 16, 0, 31, tzinfo=tzinfo), @@ -138,7 +138,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: datetime(2022, 12, 16, 2, 30, tzinfo=tzinfo), datetime(2022, 12, 16, 3, 0, tzinfo=tzinfo), ], - "count": [2, 1, 1, 1, 1, 1], + "len": [2, 1, 1, 1, 1, 1], } # start by monday @@ -156,7 +156,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: include_boundaries=True, start_by="monday", label="datapoint", - ).agg([pl.count(), pl.col("day").first().alias("data_day")]) + ).agg([pl.len(), pl.col("day").first().alias("data_day")]) assert result.to_dict(as_series=False) == { "_lower_boundary": [ datetime(2022, 1, 3, 0, 0, tzinfo=tzinfo), @@ -170,7 +170,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: datetime(2022, 1, 3, 0, 0, tzinfo=tzinfo), datetime(2022, 1, 10, 0, 0, tzinfo=tzinfo), ], - "count": [6, 5], + "len": [6, 5], "data_day": [1, 1], } # start by saturday @@ -181,7 +181,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: include_boundaries=True, start_by="saturday", label="datapoint", - ).agg([pl.count(), pl.col("day").first().alias("data_day")]) + ).agg([pl.len(), pl.col("day").first().alias("data_day")]) assert result.to_dict(as_series=False) == { "_lower_boundary": [ datetime(2022, 1, 1, 0, 0, tzinfo=tzinfo), @@ -195,7 +195,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: datetime(2022, 1, 1, 0, 0, tzinfo=tzinfo), datetime(2022, 1, 8, 0, 0, tzinfo=tzinfo), ], - "count": [6, 6], + "len": [6, 6], "data_day": [6, 6], } diff --git 
a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index 4f606f99b6e3..097a9f93a453 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -56,7 +56,7 @@ def test_pivot_list() -> None: ("agg_fn", "expected_rows"), [ ("first", [("a", 2, None, None), ("b", None, None, 10)]), - ("count", [("a", 2, None, None), ("b", None, 2, 1)]), + ("len", [("a", 2, None, None), ("b", None, 2, 1)]), ("min", [("a", 2, None, None), ("b", None, 8, 10)]), ("max", [("a", 4, None, None), ("b", None, 8, 10)]), ("sum", [("a", 6, None, None), ("b", None, 8, 10)]), @@ -106,14 +106,12 @@ def test_pivot_categorical_index() -> None: schema=[("A", pl.Categorical), ("B", pl.Categorical)], ) - result = df.pivot(values="B", index=["A"], columns="B", aggregate_function="count") + result = df.pivot(values="B", index=["A"], columns="B", aggregate_function="len") expected = {"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, None]} assert result.to_dict(as_series=False) == expected # test expression dispatch - result = df.pivot( - values="B", index=["A"], columns="B", aggregate_function=pl.count() - ) + result = df.pivot(values="B", index=["A"], columns="B", aggregate_function=pl.len()) assert result.to_dict(as_series=False) == expected df = pl.DataFrame( @@ -125,7 +123,7 @@ def test_pivot_categorical_index() -> None: schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)], ) result = df.pivot( - values="B", index=["A", "C"], columns="B", aggregate_function="count" + values="B", index=["A", "C"], columns="B", aggregate_function="len" ) expected = { "A": ["Fire", "Water"], diff --git a/py-polars/tests/unit/operations/test_random.py b/py-polars/tests/unit/operations/test_random.py index 328373a65f44..71195f46d239 100644 --- a/py-polars/tests/unit/operations/test_random.py +++ b/py-polars/tests/unit/operations/test_random.py @@ -14,7 +14,7 @@ def unique_shuffle_groups(n: int, seed: 
int | None) -> int: shuffled = df.group_by("group", maintain_order=True).agg( pl.col("l").shuffle(seed) ) - num_unique = shuffled.group_by("l").agg(pl.lit(0)).select(pl.count()) + num_unique = shuffled.group_by("l").agg(pl.lit(0)).select(pl.len()) return int(num_unique[0, 0]) assert unique_shuffle_groups(50, None) > 1 # Astronomically unlikely. diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index c9c4ff5e95c8..ddde2576462e 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -57,9 +57,10 @@ def test_rolling_negative_offset_3914() -> None: ), } ) - assert df.rolling(index_column="datetime", period="2d", offset="-4d").agg( - pl.count().alias("count") - )["count"].to_list() == [0, 0, 1, 2, 2] + result = df.rolling(index_column="datetime", period="2d", offset="-4d").agg( + pl.len() + ) + assert result["len"].to_list() == [0, 0, 1, 2, 2] df = pl.DataFrame( { diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index ce4c3dd8ceff..0e23df2dc015 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -118,7 +118,7 @@ def test_window_function_cache() -> None: def test_window_range_no_rows() -> None: df = pl.DataFrame({"x": [5, 5, 4, 4, 2, 2]}) - expr = pl.int_range(0, pl.count()).over("x") + expr = pl.int_range(0, pl.len()).over("x") out = df.with_columns(int=expr) assert_frame_equal( out, pl.DataFrame({"x": [5, 5, 4, 4, 2, 2], "int": [0, 1, 0, 1, 0, 1]}) @@ -193,14 +193,14 @@ def test_cumulative_eval_window_functions() -> None: assert_frame_equal(result, expected) -def test_count_window() -> None: +def test_len_window() -> None: assert ( pl.DataFrame( { "a": [1, 1, 2], } ) - .with_columns(pl.count().over("a"))["count"] + .with_columns(pl.len().over("a"))["len"] .to_list() ) == [2, 2, 1] diff --git 
a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index 2d318874aace..fd18289fdc86 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -232,12 +232,12 @@ def test_streaming_9776() -> None: df = pl.DataFrame({"col_1": ["a"] * 1000, "ID": [None] + ["a"] * 999}) ordered = ( df.group_by("col_1", "ID", maintain_order=True) - .count() + .len() .filter(pl.col("col_1") == "a") ) unordered = ( df.group_by("col_1", "ID", maintain_order=False) - .count() + .len() .filter(pl.col("col_1") == "a") ) expected = [("a", None, 1), ("a", "a", 999)] diff --git a/py-polars/tests/unit/streaming/test_streaming_group_by.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py index e4ad5117b5b9..35715f18179c 100644 --- a/py-polars/tests/unit/streaming/test_streaming_group_by.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -26,12 +26,12 @@ def test_streaming_group_by_sorted_fast_path_nulls_10273() -> None: df.set_sorted("x") .lazy() .group_by("x") - .agg(pl.count()) + .agg(pl.len()) .collect(streaming=True) .sort("x") ).to_dict(as_series=False) == { "x": [None, 0, 1, 2, 3], - "count": [100, 100, 100, 100, 100], + "len": [100, 100, 100, 100, 100], } @@ -147,18 +147,14 @@ def test_streaming_group_by_min_max() -> None: def test_streaming_non_streaming_gb() -> None: n = 100 df = pl.DataFrame({"a": np.random.randint(0, 20, n)}) - q = df.lazy().group_by("a").agg(pl.count()).sort("a") + q = df.lazy().group_by("a").agg(pl.len()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").cast(pl.String)) - q = q.group_by("a").agg(pl.count()).sort("a") + q = q.group_by("a").agg(pl.len()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").alias("b")) - q = ( - q.group_by(["a", "b"]) - .agg(pl.count(), pl.col("a").sum().alias("sum_a")) - .sort("a") - ) 
+ q = q.group_by(["a", "b"]).agg(pl.len(), pl.col("a").sum().alias("sum_a")).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) @@ -289,11 +285,11 @@ def test_streaming_group_by_struct_key() -> None: {"A": [1, 2, 3, 2], "B": ["google", "ms", "apple", "ms"], "C": [2, 3, 4, 3]} ) df1 = df.lazy().with_columns(pl.struct(["A", "C"]).alias("tuples")) - assert df1.group_by("tuples").agg(pl.count(), pl.col("B").first()).sort( - "B" - ).collect(streaming=True).to_dict(as_series=False) == { + assert df1.group_by("tuples").agg(pl.len(), pl.col("B").first()).sort("B").collect( + streaming=True + ).to_dict(as_series=False) == { "tuples": [{"A": 3, "C": 4}, {"A": 1, "C": 2}, {"A": 2, "C": 3}], - "count": [1, 1, 2], + "len": [1, 1, 2], "B": ["apple", "google", "ms"], } diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index fdb1ef67a0db..b9fac236edba 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -469,7 +469,7 @@ def test_cse_count_in_group_by() -> None: q = ( pl.LazyFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [40, 51, 12]}) .group_by("a") - .agg(pl.all().slice(0, pl.count() - 1)) + .agg(pl.all().slice(0, pl.len() - 1)) ) assert "POLARS_CSER" not in q.explain() @@ -527,8 +527,8 @@ def test_cse_slice_11594() -> None: df = pl.LazyFrame({"a": [1, 2, 1, 2, 1, 2]}) q = df.select( - pl.col("a").slice(offset=1, length=pl.count() - 1).alias("1"), - pl.col("a").slice(offset=1, length=pl.count() - 1).alias("2"), + pl.col("a").slice(offset=1, length=pl.len() - 1).alias("1"), + pl.col("a").slice(offset=1, length=pl.len() - 1).alias("2"), ) assert "__POLARS_CSE" in q.explain(comm_subexpr_elim=True) @@ -539,8 +539,8 @@ def test_cse_slice_11594() -> None: } q = df.select( - pl.col("a").slice(offset=1, length=pl.count() - 1).alias("1"), - pl.col("a").slice(offset=0, length=pl.count() - 1).alias("2"), + pl.col("a").slice(offset=1, length=pl.len() - 1).alias("1"), + pl.col("a").slice(offset=0, length=pl.len() 
- 1).alias("2"), ) assert "__POLARS_CSE" in q.explain(comm_subexpr_elim=True) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index 34530d10772c..dda6fe590737 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -20,7 +20,7 @@ def test_error_on_empty_group_by() -> None: with pytest.raises( pl.ComputeError, match="at least one key is required in a group_by operation" ): - pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.count()) + pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.len()) def test_error_on_reducing_map() -> None: diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index a6957ca18c88..85330e1d4f17 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -146,11 +146,11 @@ def test_count_suffix_10783() -> None: } ) df_with_cnt = df.with_columns( - pl.count() + pl.len() .over(pl.col("a").list.sort().list.join("").hash()) .name.suffix("_suffix") ) - df_expect = df.with_columns(pl.Series("count_suffix", [3, 3, 1, 3])) + df_expect = df.with_columns(pl.Series("len_suffix", [3, 3, 1, 3])) assert_frame_equal(df_with_cnt, df_expect, check_dtype=False) @@ -1197,7 +1197,7 @@ def test_predicate_count_vstack() -> None: "v": [5, 7], } ) - assert pl.concat([l1, l2]).filter(pl.count().over("k") == 2).collect()[ + assert pl.concat([l1, l2]).filter(pl.len().over("k") == 2).collect()[ "v" ].to_list() == [3, 2, 5, 7] diff --git a/py-polars/tests/unit/test_predicates.py b/py-polars/tests/unit/test_predicates.py index c6c0147e3fd0..811b5c82c32f 100644 --- a/py-polars/tests/unit/test_predicates.py +++ b/py-polars/tests/unit/test_predicates.py @@ -203,7 +203,7 @@ def test_predicate_pushdown_group_by_keys() -> None: assert ( 'SELECTION: "None"' not in df.group_by("group") - .agg([pl.count().alias("str_list")]) + .agg([pl.len().alias("str_list")]) .filter(pl.col("group") == 1) .explain() ) @@ -388,16 +388,16 @@ def 
test_predicate_pushdown_with_window_projections_12637() -> None: # that only refers to the common window keys. actual = lf.with_columns( (pl.col("value") * 2).over("key").alias("value_2"), - ).filter(pl.count().over("key") == 1) + ).filter(pl.len().over("key") == 1) plan = actual.explain() - assert r'FILTER [(count().over([col("key")])) == (1)]' in plan + assert r'FILTER [(len().over([col("key")])) == (1)]' in plan assert 'SELECTION: "None"' in plan # Test window in filter - actual = lf.filter(pl.count().over("key") == 1).filter(pl.col("key") == 1) + actual = lf.filter(pl.len().over("key") == 1).filter(pl.col("key") == 1) plan = actual.explain() - assert r'FILTER [(count().over([col("key")])) == (1)]' in plan + assert r'FILTER [(len().over([col("key")])) == (1)]' in plan assert r'SELECTION: "[(col(\"key\")) == (1)]"' in plan diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index 199c5b6f2659..51bd070e08b7 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -275,18 +275,22 @@ def test_merge_sorted_projection_pd() -> None: def test_distinct_projection_pd_7578() -> None: - df = pl.DataFrame( + lf = pl.LazyFrame( { "foo": ["0", "1", "2", "1", "2"], "bar": ["a", "a", "a", "b", "b"], } ) - q = df.lazy().unique().group_by("bar").agg(pl.count()) - assert q.collect().sort("bar").to_dict(as_series=False) == { - "bar": ["a", "b"], - "count": [3, 2], - } + result = lf.unique().group_by("bar").agg(pl.len()) + expected = pl.LazyFrame( + { + "bar": ["a", "b"], + "len": [3, 2], + }, + schema_overrides={"len": pl.UInt32}, + ) + assert_frame_equal(result, expected) def test_join_suffix_collision_9562() -> None: @@ -351,7 +355,7 @@ def test_projection_rename_10595() -> None: def test_projection_count_11841() -> None: - pl.LazyFrame({"x": 1}).select(records=pl.count()).select( + pl.LazyFrame({"x": 1}).select(records=pl.len()).select( pl.lit(1).alias("x"), pl.all() ).collect() diff --git 
a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 08edd662a7d4..1a28b608ae06 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -34,7 +34,7 @@ def test_repeat_expansion_in_group_by() -> None: out = ( pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]}) .group_by("g", maintain_order=True) - .agg(pl.repeat(1, pl.count()).cum_sum()) + .agg(pl.repeat(1, pl.len()).cum_sum()) .to_dict(as_series=False) ) assert out == {"g": [1, 2, 3], "repeat": [[1], [1, 2], [1, 2, 3]]} @@ -126,10 +126,10 @@ def test_sorted_group_by_optimization(monkeypatch: Any) -> None: sorted_implicit = ( df.with_columns(pl.col("a").sort(descending=descending)) .group_by("a") - .agg(pl.count()) + .agg(pl.len()) ) sorted_explicit = ( - df.group_by("a").agg(pl.count()).sort("a", descending=descending) + df.group_by("a").agg(pl.len()).sort("a", descending=descending) ) assert_frame_equal(sorted_explicit, sorted_implicit) @@ -258,7 +258,7 @@ def map_expr(name: str) -> pl.Expr: pl.struct( [ pl.sum(name).alias("sum"), - (pl.count() - pl.col(name).null_count()).alias("count"), + (pl.len() - pl.col(name).null_count()).alias("count"), ] ), ) diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 673c3fa86dcf..f5eb9a8e4b57 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -150,8 +150,7 @@ def test_bool_numeric_supertype() -> None: pl.Int64, ]: assert ( - df.select([(pl.col("v") < 3).sum().cast(dt) / pl.count()]).item() - - 0.3333333 + df.select([(pl.col("v") < 3).sum().cast(dt) / pl.len()]).item() - 0.3333333 <= 0.00001 ) @@ -631,5 +630,5 @@ def test_literal_subtract_schema_13284() -> None: pl.LazyFrame({"a": [23, 30]}, schema={"a": pl.UInt8}) .with_columns(pl.col("a") - pl.lit(1)) .group_by(by="a") - .count() - ).schema == OrderedDict([("a", pl.UInt8), ("count", pl.UInt32)]) + .len() + ).schema == OrderedDict([("a", pl.UInt8), ("len", pl.UInt32)])