From 3fb7ab79326dd84f090d31bf669b59325919cb5d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 16 Jan 2024 16:35:38 +0100 Subject: [PATCH] depr(python,rust!): Rename `pl.count()` to `pl.len()` (#13719) --- .../src/physical_plan/expressions/count.rs | 12 +- .../src/physical_plan/planner/expr.rs | 6 +- .../src/physical_plan/planner/lp.rs | 6 +- crates/polars-lazy/src/tests/aggregations.rs | 4 +- crates/polars-lazy/src/tests/queries.rs | 2 +- .../sinks/group_by/aggregates/convert.rs | 18 +-- crates/polars-plan/src/dsl/consts.rs | 2 +- crates/polars-plan/src/dsl/expr.rs | 5 +- crates/polars-plan/src/dsl/functions/index.rs | 2 +- crates/polars-plan/src/dsl/mod.rs | 18 +-- .../polars-plan/src/logical_plan/aexpr/mod.rs | 12 +- .../src/logical_plan/aexpr/schema.rs | 2 +- .../src/logical_plan/conversion.rs | 4 +- crates/polars-plan/src/logical_plan/format.rs | 2 +- .../polars-plan/src/logical_plan/iterator.rs | 2 +- .../src/logical_plan/optimizer/cse_expr.rs | 2 +- .../optimizer/predicate_pushdown/group_by.rs | 2 +- .../optimizer/predicate_pushdown/mod.rs | 2 +- .../projection_pushdown/projection.rs | 2 +- .../src/logical_plan/tree_format.rs | 2 +- .../src/logical_plan/visitor/expr.rs | 2 +- crates/polars-plan/src/utils.rs | 6 +- crates/polars-sql/src/functions.rs | 6 +- .../polars/tests/it/lazy/expressions/apply.rs | 2 +- .../polars/tests/it/lazy/expressions/arity.rs | 2 +- .../polars/tests/it/lazy/expressions/slice.rs | 2 +- .../tests/it/lazy/expressions/window.rs | 2 +- .../polars/tests/it/lazy/predicate_queries.rs | 2 +- .../python/user-guide/basics/expressions.py | 2 +- .../user-guide/expressions/aggregation.py | 4 +- .../expressions/user-defined-functions.py | 2 +- docs/src/python/user-guide/io/multiple.py | 5 +- .../transformations/time-series/rolling.py | 9 +- .../src/rust/user-guide/basics/expressions.rs | 7 +- .../user-guide/expressions/aggregation.rs | 4 +- .../rust/user-guide/expressions/structs.rs | 6 +- .../transformations/time-series/rolling.rs | 2 
+- py-polars/polars/__init__.py | 3 + py-polars/polars/dataframe/frame.py | 16 ++- py-polars/polars/dataframe/group_by.py | 30 ++++- py-polars/polars/expr/meta.py | 8 +- py-polars/polars/functions/__init__.py | 3 + py-polars/polars/functions/lazy.py | 110 ++++++------------ py-polars/polars/functions/len.py | 67 +++++++++++ py-polars/polars/functions/range/int_range.py | 4 +- py-polars/polars/lazyframe/frame.py | 4 +- py-polars/polars/lazyframe/group_by.py | 38 +++++- py-polars/polars/type_aliases.py | 2 +- py-polars/src/functions/lazy.rs | 9 +- py-polars/src/lib.rs | 4 +- py-polars/tests/unit/dataframe/test_df.py | 6 +- .../tests/unit/datatypes/test_categorical.py | 2 +- .../tests/unit/datatypes/test_temporal.py | 36 ++---- py-polars/tests/unit/expr/test_exprs.py | 12 +- .../tests/unit/functions/test_cum_count.py | 3 +- py-polars/tests/unit/interop/test_interop.py | 12 +- py-polars/tests/unit/io/test_lazy_csv.py | 4 +- py-polars/tests/unit/io/test_pickle.py | 2 +- py-polars/tests/unit/namespaces/test_meta.py | 6 +- .../unit/operations/rolling/test_rolling.py | 4 +- .../tests/unit/operations/test_filter.py | 6 +- .../tests/unit/operations/test_group_by.py | 8 +- .../unit/operations/test_group_by_dynamic.py | 12 +- py-polars/tests/unit/operations/test_pivot.py | 10 +- .../tests/unit/operations/test_random.py | 2 +- .../tests/unit/operations/test_rolling.py | 7 +- .../tests/unit/operations/test_window.py | 6 +- .../tests/unit/streaming/test_streaming.py | 4 +- .../unit/streaming/test_streaming_group_by.py | 22 ++-- py-polars/tests/unit/test_cse.py | 10 +- py-polars/tests/unit/test_errors.py | 2 +- py-polars/tests/unit/test_lazy.py | 6 +- py-polars/tests/unit/test_predicates.py | 10 +- py-polars/tests/unit/test_projections.py | 18 +-- py-polars/tests/unit/test_queries.py | 8 +- py-polars/tests/unit/test_schema.py | 7 +- 76 files changed, 376 insertions(+), 319 deletions(-) create mode 100644 py-polars/polars/functions/len.py diff --git 
a/crates/polars-lazy/src/physical_plan/expressions/count.rs b/crates/polars-lazy/src/physical_plan/expressions/count.rs index 6dc754d59dc1..2479507bdf30 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/count.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/count.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use polars_core::prelude::*; -use polars_plan::dsl::consts::COUNT; +use polars_plan::dsl::consts::LEN; use crate::physical_plan::state::ExecutionState; use crate::prelude::*; @@ -12,7 +12,7 @@ pub struct CountExpr { impl CountExpr { pub(crate) fn new() -> Self { - Self { expr: Expr::Count } + Self { expr: Expr::Len } } } @@ -22,7 +22,7 @@ impl PhysicalExpr for CountExpr { } fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult { - Ok(Series::new("count", [df.height() as IdxSize])) + Ok(Series::new("len", [df.height() as IdxSize])) } fn evaluate_on_groups<'a>( @@ -31,13 +31,13 @@ impl PhysicalExpr for CountExpr { groups: &'a GroupsProxy, _state: &ExecutionState, ) -> PolarsResult> { - let ca = groups.group_count().with_name(COUNT); + let ca = groups.group_count().with_name(LEN); let s = ca.into_series(); Ok(AggregationContext::new(s, Cow::Borrowed(groups), true)) } fn to_field(&self, _input_schema: &Schema) -> PolarsResult { - Ok(Field::new(COUNT, IDX_DTYPE)) + Ok(Field::new(LEN, IDX_DTYPE)) } fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { @@ -67,6 +67,6 @@ impl PartitionedAggregation for CountExpr { ) -> PolarsResult { // SAFETY: groups are in bounds. 
let agg = unsafe { partitioned.agg_sum(groups) }; - Ok(agg.with_name(COUNT)) + Ok(agg.with_name(LEN)) } } diff --git a/crates/polars-lazy/src/physical_plan/planner/expr.rs b/crates/polars-lazy/src/physical_plan/planner/expr.rs index 39d68c8901fb..5028bd3ed36d 100644 --- a/crates/polars-lazy/src/physical_plan/planner/expr.rs +++ b/crates/polars-lazy/src/physical_plan/planner/expr.rs @@ -91,7 +91,7 @@ pub(crate) fn create_physical_expr( use AExpr::*; match expr_arena.get(expression).clone() { - Count => Ok(Arc::new(phys_expr::CountExpr::new())), + Len => Ok(Arc::new(phys_expr::CountExpr::new())), Window { mut function, partition_by, @@ -129,8 +129,8 @@ pub(crate) fn create_physical_expr( if apply_columns.is_empty() { if has_aexpr(function, expr_arena, |e| matches!(e, AExpr::Literal(_))) { apply_columns.push(Arc::from("literal")) - } else if has_aexpr(function, expr_arena, |e| matches!(e, AExpr::Count)) { - apply_columns.push(Arc::from("count")) + } else if has_aexpr(function, expr_arena, |e| matches!(e, AExpr::Len)) { + apply_columns.push(Arc::from("len")) } else { let e = node_to_expr(function, expr_arena); polars_bail!( diff --git a/crates/polars-lazy/src/physical_plan/planner/lp.rs b/crates/polars-lazy/src/physical_plan/planner/lp.rs index 1521018e89d5..ea47cf3308dc 100644 --- a/crates/polars-lazy/src/physical_plan/planner/lp.rs +++ b/crates/polars-lazy/src/physical_plan/planner/lp.rs @@ -38,7 +38,7 @@ fn partitionable_gb( let depth = (expr_arena).iter(*agg).count(); // These single expressions are partitionable - if matches!(aexpr, AExpr::Count) { + if matches!(aexpr, AExpr::Len) { continue; } // col() @@ -55,7 +55,7 @@ fn partitionable_gb( // count().alias() is allowed: count of 2 if depth <= 2 { match expr_arena.get(*input) { - AExpr::Count => {}, + AExpr::Len => {}, _ => { partitionable = false; break; @@ -103,7 +103,7 @@ fn partitionable_gb( Ternary {truthy, falsy, predicate,..} => { !has_aggregation(*truthy) && !has_aggregation(*falsy) && 
!has_aggregation(*predicate) } - Column(_) | Alias(_, _) | Count | Literal(_) | Cast {..} => { + Column(_) | Alias(_, _) | Len | Literal(_) | Cast {..} => { true } _ => { diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index 9f46823ae750..c54e584f2731 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -243,8 +243,8 @@ fn test_binary_agg_context_0() -> PolarsResult<()> { .lazy() .group_by_stable([col("groups")]) .agg([when(col("vals").first().neq(lit(1))) - .then(repeat(lit("a"), count())) - .otherwise(repeat(lit("b"), count())) + .then(repeat(lit("a"), len())) + .otherwise(repeat(lit("b"), len())) .alias("foo")]) .collect() .unwrap(); diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index 85f1b7ac86dc..4d997343e68b 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -1807,7 +1807,7 @@ fn test_partitioned_gb_count() -> PolarsResult<()> { .group_by([col("col")]) .agg([ // we make sure to alias with a different name - count().alias("counted"), + len().alias("counted"), col("col").count().alias("count2"), ]) .collect()?; diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs index 9581d6ca3350..16826434a96d 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs @@ -23,14 +23,14 @@ use crate::executors::sinks::group_by::aggregates::{AggregateFunction, SumAgg}; use crate::expressions::PhysicalPipedExpr; use crate::operators::DataChunk; -struct Count {} +struct Len {} -impl PhysicalIoExpr for Count { +impl PhysicalIoExpr for Len { fn evaluate_io(&self, _df: &DataFrame) -> PolarsResult { unimplemented!() } } -impl PhysicalPipedExpr for Count { +impl 
PhysicalPipedExpr for Len { fn evaluate(&self, chunk: &DataChunk, _lazy_state: &dyn Any) -> PolarsResult { // the length must match the chunks as the operators expect that // so we fill a null series. @@ -42,7 +42,7 @@ impl PhysicalPipedExpr for Count { } fn expression(&self) -> Expr { - Expr::Count + Expr::Len } } @@ -57,7 +57,7 @@ pub fn can_convert_to_hash_agg( .map(|(_, ae)| { match ae { AExpr::Agg(_) - | AExpr::Count + | AExpr::Len | AExpr::Cast { .. } | AExpr::Literal(_) | AExpr::Column(_) @@ -70,7 +70,7 @@ pub fn can_convert_to_hash_agg( } ae }) - .filter(|ae| matches!(ae, AExpr::Agg(_) | AExpr::Count)) + .filter(|ae| matches!(ae, AExpr::Agg(_) | AExpr::Len)) .count() == 1 && can_run_partitioned @@ -80,7 +80,7 @@ pub fn can_convert_to_hash_agg( node = *input } match expr_arena.get(node) { - AExpr::Count => true, + AExpr::Len => true, ae @ AExpr::Agg(agg_fn) => { matches!( agg_fn, @@ -128,9 +128,9 @@ where { match expr_arena.get(node) { AExpr::Alias(input, _) => convert_to_hash_agg(*input, expr_arena, schema, to_physical), - AExpr::Count => ( + AExpr::Len => ( IDX_DTYPE, - Arc::new(Count {}), + Arc::new(Len {}), AggregateFunction::Count(CountAgg::new()), ), AExpr::Agg(agg) => match agg { diff --git a/crates/polars-plan/src/dsl/consts.rs b/crates/polars-plan/src/dsl/consts.rs index bc8314bc71ec..acc4b4649fc2 100644 --- a/crates/polars-plan/src/dsl/consts.rs +++ b/crates/polars-plan/src/dsl/consts.rs @@ -1,3 +1,3 @@ -pub const COUNT: &str = "count"; +pub const LEN: &str = "len"; pub const LITERAL_NAME: &str = "literal"; diff --git a/crates/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs index c14e8d339e28..b91deca57e9d 100644 --- a/crates/polars-plan/src/dsl/expr.rs +++ b/crates/polars-plan/src/dsl/expr.rs @@ -135,8 +135,7 @@ pub enum Expr { Exclude(Box, Vec), /// Set root name as Alias KeepName(Box), - /// Special case that does not need columns - Count, + Len, /// Take the nth column in the `DataFrame` Nth(i64), // skipped fields must be last 
otherwise serde fails in pickle @@ -223,7 +222,7 @@ impl Hash for Expr { options.hash(state); }, // already hashed by discriminant - Expr::Wildcard | Expr::Count => {}, + Expr::Wildcard | Expr::Len => {}, #[allow(unreachable_code)] _ => { // the panic checks if we hit this diff --git a/crates/polars-plan/src/dsl/functions/index.rs b/crates/polars-plan/src/dsl/functions/index.rs index 6d0d2528b46f..20e7245d4021 100644 --- a/crates/polars-plan/src/dsl/functions/index.rs +++ b/crates/polars-plan/src/dsl/functions/index.rs @@ -8,7 +8,7 @@ use super::*; pub fn arg_sort_by>(by: E, descending: &[bool]) -> Expr { let e = &by.as_ref()[0]; let name = expr_output_name(e).unwrap(); - int_range(lit(0 as IdxSize), count().cast(IDX_DTYPE), 1, IDX_DTYPE) + int_range(lit(0 as IdxSize), len().cast(IDX_DTYPE), 1, IDX_DTYPE) .sort_by(by, descending) .alias(name.as_ref()) } diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index d6356a8a0aed..516e45dee066 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1760,21 +1760,9 @@ where } } -/// Count expression. -pub fn count() -> Expr { - Expr::Count -} - -/// Return the cumulative count of the context. -#[cfg(feature = "range")] -pub fn cum_count(reverse: bool) -> Expr { - let start = lit(1 as IdxSize); - let end = count() + lit(1 as IdxSize); - let mut range = int_range(start, end, 1, IDX_DTYPE); - if reverse { - range = range.reverse() - } - range.alias("cum_count") +/// Return the number of rows in the context. +pub fn len() -> Expr { + Expr::Len } /// First column in DataFrame. 
diff --git a/crates/polars-plan/src/logical_plan/aexpr/mod.rs b/crates/polars-plan/src/logical_plan/aexpr/mod.rs index 50b00067025e..704d3fc1d1c7 100644 --- a/crates/polars-plan/src/logical_plan/aexpr/mod.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/mod.rs @@ -15,7 +15,7 @@ use crate::dsl::function_expr::FunctionExpr; #[cfg(feature = "cse")] use crate::logical_plan::visitor::AexprNode; use crate::logical_plan::Context; -use crate::prelude::consts::COUNT; +use crate::prelude::consts::LEN; use crate::prelude::*; #[derive(Clone, Debug, IntoStaticStr)] @@ -188,7 +188,7 @@ pub enum AExpr { offset: Node, length: Node, }, - Count, + Len, Nth(i64), } @@ -224,7 +224,7 @@ impl AExpr { | SortBy { .. } | Agg { .. } | Window { .. } - | Count + | Len | Slice { .. } | Gather { .. } | Nth(_) @@ -259,7 +259,7 @@ impl AExpr { use AExpr::*; match self { - Nth(_) | Column(_) | Literal(_) | Wildcard | Count => {}, + Nth(_) | Column(_) | Literal(_) | Wildcard | Len => {}, Alias(e, _) => container.push(*e), BinaryExpr { left, op: _, right } => { // reverse order so that left is popped first @@ -338,7 +338,7 @@ impl AExpr { pub(crate) fn replace_inputs(mut self, inputs: &[Node]) -> Self { use AExpr::*; let input = match &mut self { - Column(_) | Literal(_) | Wildcard | Count | Nth(_) => return self, + Column(_) | Literal(_) | Wildcard | Len | Nth(_) => return self, Alias(input, _) => input, Cast { expr, .. 
} => expr, Explode(input) => input, @@ -420,7 +420,7 @@ impl AExpr { pub(crate) fn is_leaf(&self) -> bool { matches!( self, - AExpr::Column(_) | AExpr::Literal(_) | AExpr::Count | AExpr::Nth(_) + AExpr::Column(_) | AExpr::Literal(_) | AExpr::Len | AExpr::Nth(_) ) } } diff --git a/crates/polars-plan/src/logical_plan/aexpr/schema.rs b/crates/polars-plan/src/logical_plan/aexpr/schema.rs index 282c1009ac69..6049f49695de 100644 --- a/crates/polars-plan/src/logical_plan/aexpr/schema.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/schema.rs @@ -17,7 +17,7 @@ impl AExpr { use AExpr::*; use DataType::*; match self { - Count => Ok(Field::new(COUNT, IDX_DTYPE)), + Len => Ok(Field::new(LEN, IDX_DTYPE)), Window { function, .. } => { let e = arena.get(*function); e.to_field(schema, ctxt, arena) diff --git a/crates/polars-plan/src/logical_plan/conversion.rs b/crates/polars-plan/src/logical_plan/conversion.rs index fec74b7ecdb0..15b02e70522e 100644 --- a/crates/polars-plan/src/logical_plan/conversion.rs +++ b/crates/polars-plan/src/logical_plan/conversion.rs @@ -151,7 +151,7 @@ pub fn to_aexpr(expr: Expr, arena: &mut Arena) -> Node { length: to_aexpr(*length, arena), }, Expr::Wildcard => AExpr::Wildcard, - Expr::Count => AExpr::Count, + Expr::Len => AExpr::Len, Expr::Nth(i) => AExpr::Nth(i), Expr::SubPlan { .. 
} => panic!("no SQLSubquery expected at this point"), Expr::KeepName(_) => panic!("no `name.keep` expected at this point"), @@ -598,7 +598,7 @@ pub fn node_to_expr(node: Node, expr_arena: &Arena) -> Expr { offset: Box::new(node_to_expr(offset, expr_arena)), length: Box::new(node_to_expr(length, expr_arena)), }, - AExpr::Count => Expr::Count, + AExpr::Len => Expr::Len, AExpr::Nth(i) => Expr::Nth(i), AExpr::Wildcard => Expr::Wildcard, } diff --git a/crates/polars-plan/src/logical_plan/format.rs b/crates/polars-plan/src/logical_plan/format.rs index ff22c7345938..413818c46478 100644 --- a/crates/polars-plan/src/logical_plan/format.rs +++ b/crates/polars-plan/src/logical_plan/format.rs @@ -269,7 +269,7 @@ impl Debug for Expr { }, }, Nth(i) => write!(f, "nth({i})"), - Count => write!(f, "count()"), + Len => write!(f, "len()"), Explode(expr) => write!(f, "{expr:?}.explode()"), Alias(expr, name) => write!(f, "{expr:?}.alias(\"{name}\")"), Column(name) => write!(f, "col(\"{name}\")"), diff --git a/crates/polars-plan/src/logical_plan/iterator.rs b/crates/polars-plan/src/logical_plan/iterator.rs index 33296118ce31..e83952476d82 100644 --- a/crates/polars-plan/src/logical_plan/iterator.rs +++ b/crates/polars-plan/src/logical_plan/iterator.rs @@ -6,7 +6,7 @@ macro_rules! 
push_expr { ($current_expr:expr, $push:ident, $iter:ident) => {{ use Expr::*; match $current_expr { - Nth(_) | Column(_) | Literal(_) | Wildcard | Columns(_) | DtypeColumn(_) | Count => {}, + Nth(_) | Column(_) | Literal(_) | Wildcard | Columns(_) | DtypeColumn(_) | Len => {}, Alias(e, _) => $push(e), BinaryExpr { left, op: _, right } => { // reverse order so that left is popped first diff --git a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs index 0b72b1a0073d..3e7891bc1c66 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs @@ -317,7 +317,7 @@ impl ExprIdentifierVisitor<'_> { // TODO! Add a typed null AExpr::Literal(LiteralValue::Null) => REFUSE_NO_MEMBER, AExpr::Column(_) | AExpr::Literal(_) | AExpr::Alias(_, _) => REFUSE_ALLOW_MEMBER, - AExpr::Count => { + AExpr::Len => { if self.is_group_by { REFUSE_NO_MEMBER } else { diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/group_by.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/group_by.rs index 3ab423a6640d..cca7d1c3f1ac 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/group_by.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/group_by.rs @@ -52,7 +52,7 @@ pub(super) fn process_group_by( // Counts change due to groupby's // TODO! handle aliases, so that the predicate that is pushed down refers to the column before alias. 
let mut push_down = !has_aexpr(*predicate, expr_arena, |ae| { - matches!(ae, AExpr::Count | AExpr::Alias(_, _)) + matches!(ae, AExpr::Len | AExpr::Alias(_, _)) }); for name in aexpr_to_leaf_names_iter(*predicate, expr_arena) { diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs index 4d9cbc2b46df..c9f08519ffb4 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs @@ -530,7 +530,7 @@ impl<'a> PredicatePushDown<'a> { // a count is influenced by a Union/Vstack acc_predicates.retain(|_, predicate| { - if has_aexpr(*predicate, expr_arena, |ae| matches!(ae, AExpr::Count)) { + if has_aexpr(*predicate, expr_arena, |ae| matches!(ae, AExpr::Len)) { local_predicates.push(*predicate); false } else { diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs index 8cf418011888..8e714a3a40ca 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs @@ -3,7 +3,7 @@ use super::*; fn is_count(node: Node, expr_arena: &Arena) -> bool { match expr_arena.get(node) { AExpr::Alias(node, _) => is_count(*node, expr_arena), - AExpr::Count => true, + AExpr::Len => true, _ => false, } } diff --git a/crates/polars-plan/src/logical_plan/tree_format.rs b/crates/polars-plan/src/logical_plan/tree_format.rs index d50d0432b756..00b49fd7843f 100644 --- a/crates/polars-plan/src/logical_plan/tree_format.rs +++ b/crates/polars-plan/src/logical_plan/tree_format.rs @@ -53,7 +53,7 @@ impl UpperExp for AExpr { AExpr::Window { .. } => "window", AExpr::Wildcard => "*", AExpr::Slice { .. 
} => "slice", - AExpr::Count => "count", + AExpr::Len => "len", AExpr::Nth(v) => return write!(f, "nth({})", v), }; diff --git a/crates/polars-plan/src/logical_plan/visitor/expr.rs b/crates/polars-plan/src/logical_plan/visitor/expr.rs index 5dc3cc1cd6f7..36a7f3032546 100644 --- a/crates/polars-plan/src/logical_plan/visitor/expr.rs +++ b/crates/polars-plan/src/logical_plan/visitor/expr.rs @@ -153,7 +153,7 @@ impl AexprNode { (Gather { .. }, Gather { .. }) | (Filter { .. }, Filter { .. }) | (Ternary { .. }, Ternary { .. }) - | (Count, Count) + | (Len, Len) | (Slice { .. }, Slice { .. }) | (Explode(_), Explode(_)) => true, (SortBy { descending: l, .. }, SortBy { descending: r, .. }) => l == r, diff --git a/crates/polars-plan/src/utils.rs b/crates/polars-plan/src/utils.rs index 14ed716a0adc..523f3340d2ea 100644 --- a/crates/polars-plan/src/utils.rs +++ b/crates/polars-plan/src/utils.rs @@ -7,7 +7,7 @@ use smartstring::alias::String as SmartString; use crate::logical_plan::iterator::ArenaExprIter; use crate::logical_plan::Context; -use crate::prelude::consts::{COUNT, LITERAL_NAME}; +use crate::prelude::consts::{LEN, LITERAL_NAME}; use crate::prelude::*; /// Utility to write comma delimited strings @@ -178,7 +178,7 @@ pub fn expr_output_name(expr: &Expr) -> PolarsResult> { ComputeError: "this expression may produce multiple output names" ), - Expr::Count => return Ok(Arc::from(COUNT)), + Expr::Len => return Ok(Arc::from(LEN)), Expr::Literal(val) => { return match val { LiteralValue::Series(s) => Ok(Arc::from(s.name())), @@ -204,7 +204,7 @@ pub(crate) fn get_single_leaf(expr: &Expr) -> PolarsResult> { Expr::SortBy { expr, .. } => return get_single_leaf(expr), Expr::Window { function, .. 
} => return get_single_leaf(function), Expr::Column(name) => return Ok(name.clone()), - Expr::Count => return Ok(Arc::from(COUNT)), + Expr::Len => return Ok(Arc::from(LEN)), _ => {}, } } diff --git a/crates/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs index bc3138b006d3..86f4734f5944 100644 --- a/crates/polars-sql/src/functions.rs +++ b/crates/polars-sql/src/functions.rs @@ -1,6 +1,6 @@ use polars_core::prelude::{polars_bail, polars_err, PolarsResult}; use polars_lazy::dsl::Expr; -use polars_plan::dsl::{coalesce, concat_str, count, when}; +use polars_plan::dsl::{coalesce, concat_str, len, when}; use polars_plan::logical_plan::LiteralValue; use polars_plan::prelude::LiteralValue::Null; use polars_plan::prelude::{lit, StrptimeOptions}; @@ -1137,7 +1137,7 @@ impl SQLFunctionVisitor<'_> { let args = extract_args(self.func); match (self.func.distinct, args.as_slice()) { // count() - (false, []) => Ok(count()), + (false, []) => Ok(len()), // count(column_name) (false, [FunctionArgExpr::Expr(sql_expr)]) => { let expr = parse_sql_expr(sql_expr, self.ctx)?; @@ -1145,7 +1145,7 @@ impl SQLFunctionVisitor<'_> { Ok(expr.count()) }, // count(*) - (false, [FunctionArgExpr::Wildcard]) => Ok(count()), + (false, [FunctionArgExpr::Wildcard]) => Ok(len()), // count(distinct column_name) (true, [FunctionArgExpr::Expr(sql_expr)]) => { let expr = parse_sql_expr(sql_expr, self.ctx)?; diff --git a/crates/polars/tests/it/lazy/expressions/apply.rs b/crates/polars/tests/it/lazy/expressions/apply.rs index 749a9e21aa50..e4996920460a 100644 --- a/crates/polars/tests/it/lazy/expressions/apply.rs +++ b/crates/polars/tests/it/lazy/expressions/apply.rs @@ -9,7 +9,7 @@ fn test_int_range_agg() -> PolarsResult<()> { let out = df .lazy() - .with_columns([int_range(lit(0i32), count(), 1, DataType::Int64).over([col("x")])]) + .with_columns([int_range(lit(0i32), len(), 1, DataType::Int64).over([col("x")])]) .collect()?; assert_eq!( 
Vec::from_iter(out.column("literal")?.i64()?.into_no_null_iter()), diff --git a/crates/polars/tests/it/lazy/expressions/arity.rs b/crates/polars/tests/it/lazy/expressions/arity.rs index 095a7374b925..f51164b2862c 100644 --- a/crates/polars/tests/it/lazy/expressions/arity.rs +++ b/crates/polars/tests/it/lazy/expressions/arity.rs @@ -11,7 +11,7 @@ fn test_list_broadcast() { .unwrap() .lazy() .group_by([col("g")]) - .agg([col("a").unique_counts() * count()]) + .agg([col("a").unique_counts() * len()]) .collect() .unwrap(); } diff --git a/crates/polars/tests/it/lazy/expressions/slice.rs b/crates/polars/tests/it/lazy/expressions/slice.rs index 55e373352463..3cc1010d5bb5 100644 --- a/crates/polars/tests/it/lazy/expressions/slice.rs +++ b/crates/polars/tests/it/lazy/expressions/slice.rs @@ -15,7 +15,7 @@ fn test_slice_args() -> PolarsResult<()> { ]? .lazy() .group_by_stable([col("groups")]) - .agg([col("vals").slice(lit(0i64), count() * lit(0.2))]) + .agg([col("vals").slice(lit(0i64), len() * lit(0.2))]) .collect()?; let out = df.column("vals")?.explode()?; diff --git a/crates/polars/tests/it/lazy/expressions/window.rs b/crates/polars/tests/it/lazy/expressions/window.rs index 460a0a57b149..32a46fe01929 100644 --- a/crates/polars/tests/it/lazy/expressions/window.rs +++ b/crates/polars/tests/it/lazy/expressions/window.rs @@ -167,7 +167,7 @@ fn test_literal_window_fn() -> PolarsResult<()> { let out = df .lazy() - .select([repeat(1, count()) + .select([repeat(1, len()) .cum_sum(false) .over_with_options([col("chars")], WindowMapping::Join) .alias("foo")]) diff --git a/crates/polars/tests/it/lazy/predicate_queries.rs b/crates/polars/tests/it/lazy/predicate_queries.rs index 24f32546aee5..2af8a099e46e 100644 --- a/crates/polars/tests/it/lazy/predicate_queries.rs +++ b/crates/polars/tests/it/lazy/predicate_queries.rs @@ -221,7 +221,7 @@ fn test_count_blocked_at_union_3963() -> PolarsResult<()> { ..Default::default() }, )? 
- .filter(count().over([col("k")]).gt(lit(1))) + .filter(len().over([col("k")]).gt(lit(1))) .collect()?; assert!(out.equals(&expected)); diff --git a/docs/src/python/user-guide/basics/expressions.py b/docs/src/python/user-guide/basics/expressions.py index 590c8db1688d..12c6ea2170ec 100644 --- a/docs/src/python/user-guide/basics/expressions.py +++ b/docs/src/python/user-guide/basics/expressions.py @@ -60,7 +60,7 @@ # --8<-- [end:dataframe2] # --8<-- [start:group_by] -df2.group_by("y", maintain_order=True).count() +df2.group_by("y", maintain_order=True).len() # --8<-- [end:group_by] # --8<-- [start:group_by2] diff --git a/docs/src/python/user-guide/expressions/aggregation.py b/docs/src/python/user-guide/expressions/aggregation.py index cfcd9970573b..e25917b2de38 100644 --- a/docs/src/python/user-guide/expressions/aggregation.py +++ b/docs/src/python/user-guide/expressions/aggregation.py @@ -24,11 +24,11 @@ dataset.lazy() .group_by("first_name") .agg( - pl.count(), + pl.len(), pl.col("gender"), pl.first("last_name"), ) - .sort("count", descending=True) + .sort("len", descending=True) .limit(5) ) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 16f0da8dca76..e0658b2d36a4 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -43,7 +43,7 @@ def add_counter(val: int) -> int: out = df.select( pl.col("values").map_elements(add_counter).alias("solution_map_elements"), - (pl.col("values") + pl.int_range(1, pl.count() + 1)).alias("solution_expr"), + (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"), ) print(out) # --8<-- [end:counter] diff --git a/docs/src/python/user-guide/io/multiple.py b/docs/src/python/user-guide/io/multiple.py index f7500b6b6684..a718c5cd1588 100644 --- a/docs/src/python/user-guide/io/multiple.py +++ 
b/docs/src/python/user-guide/io/multiple.py @@ -28,12 +28,13 @@ # --8<-- [end:graph] # --8<-- [start:glob] -import polars as pl import glob +import polars as pl + queries = [] for file in glob.glob("docs/data/my_many_files_*.csv"): - q = pl.scan_csv(file).group_by("bar").agg([pl.count(), pl.sum("foo")]) + q = pl.scan_csv(file).group_by("bar").agg(pl.len(), pl.sum("foo")) queries.append(q) dataframes = pl.collect_all(queries) diff --git a/docs/src/python/user-guide/transformations/time-series/rolling.py b/docs/src/python/user-guide/transformations/time-series/rolling.py index 0a65cbc195fd..f34f56ee6d36 100644 --- a/docs/src/python/user-guide/transformations/time-series/rolling.py +++ b/docs/src/python/user-guide/transformations/time-series/rolling.py @@ -1,7 +1,8 @@ # --8<-- [start:setup] -import polars as pl from datetime import date, datetime +import polars as pl + # --8<-- [end:setup] # --8<-- [start:df] @@ -60,10 +61,6 @@ closed="both", by="groups", include_boundaries=True, -).agg( - [ - pl.count(), - ] -) +).agg(pl.len()) print(out) # --8<-- [end:group_by_dyn2] diff --git a/docs/src/rust/user-guide/basics/expressions.rs b/docs/src/rust/user-guide/basics/expressions.rs index 59e5c9338add..757c52e3939f 100644 --- a/docs/src/rust/user-guide/basics/expressions.rs +++ b/docs/src/rust/user-guide/basics/expressions.rs @@ -99,12 +99,7 @@ fn main() -> Result<(), Box> { // --8<-- [end:dataframe2] // --8<-- [start:group_by] - let out = df2 - .clone() - .lazy() - .group_by(["y"]) - .agg([count()]) - .collect()?; + let out = df2.clone().lazy().group_by(["y"]).agg([len()]).collect()?; println!("{}", out); // --8<-- [end:group_by] diff --git a/docs/src/rust/user-guide/expressions/aggregation.rs b/docs/src/rust/user-guide/expressions/aggregation.rs index 2e061ac8e15a..532b89db9482 100644 --- a/docs/src/rust/user-guide/expressions/aggregation.rs +++ b/docs/src/rust/user-guide/expressions/aggregation.rs @@ -47,9 +47,9 @@ fn main() -> Result<(), Box> { .clone() .lazy() 
.group_by(["first_name"]) - .agg([count(), col("gender"), col("last_name").first()]) + .agg([len(), col("gender"), col("last_name").first()]) .sort( - "count", + "len", SortOptions { descending: true, nulls_last: true, diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs index 7a1238154593..502f423fdf0d 100644 --- a/docs/src/rust/user-guide/expressions/structs.rs +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -1,5 +1,5 @@ // --8<-- [start:setup] -use polars::lazy::dsl::count; +use polars::lazy::dsl::len; use polars::prelude::*; // --8<-- [end:setup] fn main() -> Result<(), Box> { @@ -69,7 +69,7 @@ fn main() -> Result<(), Box> { // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated()) // Error: .is_duplicated() not available if you try that // https://github.com/pola-rs/polars/issues/3803 - .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1))) + .filter(len().over([col("Movie"), col("Theatre")]).gt(lit(1))) .collect()?; println!("{}", &out); // --8<-- [end:struct_duplicates] @@ -91,7 +91,7 @@ fn main() -> Result<(), Box> { // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated()) // Error: .is_duplicated() not available if you try that // https://github.com/pola-rs/polars/issues/3803 - .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1))) + .filter(len().over([col("Movie"), col("Theatre")]).gt(lit(1))) .collect()?; println!("{}", &out); // --8<-- [end:struct_ranking] diff --git a/docs/src/rust/user-guide/transformations/time-series/rolling.rs b/docs/src/rust/user-guide/transformations/time-series/rolling.rs index 5f5533d302ce..fc81f34412bb 100644 --- a/docs/src/rust/user-guide/transformations/time-series/rolling.rs +++ b/docs/src/rust/user-guide/transformations/time-series/rolling.rs @@ -140,7 +140,7 @@ fn main() -> Result<(), Box> { ..Default::default() }, ) - .agg([count()]) + .agg([len()]) .collect()?; println!("{}", &out); // --8<-- 
[end:group_by_dyn2] diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 5893300d0aee..2ae0aacad752 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -138,6 +138,7 @@ int_range, int_ranges, last, + len, lit, map, map_batches, @@ -393,6 +394,8 @@ "tail", "time", # named time_, see import above "var", + # polars.functions.len + "len", # polars.functions.random "set_random_seed", # polars.convert diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index b9863f13e943..cad0ab8026cb 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5272,10 +5272,10 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self: └──────┴─────┴─────┘ An index column can also be created using the expressions :func:`int_range` - and :func:`count`. + and :func:`len`. >>> df.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), ... pl.all(), ... ) shape: (3, 3) @@ -7260,9 +7260,8 @@ def pivot( - None: no aggregation takes place, will raise error if multiple values are in group. - A predefined aggregate function string, one of - {'first', 'sum', 'max', 'min', 'mean', 'median', 'last', 'count'} + {'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'} - An expression to do the aggregation. - maintain_order Sort the grouped keys so that the output order is predictable. sort_columns @@ -7392,8 +7391,15 @@ def pivot( aggregate_expr = F.element().median()._pyexpr elif aggregate_function == "last": aggregate_expr = F.element().last()._pyexpr + elif aggregate_function == "len": + aggregate_expr = F.len()._pyexpr elif aggregate_function == "count": - aggregate_expr = F.count()._pyexpr + issue_deprecation_warning( + "`aggregate_function='count'` input for `pivot` is deprecated." 
+ " Please use `aggregate_function='len'`.", + version="0.20.5", + ) + aggregate_expr = F.len()._pyexpr else: msg = f"invalid input for `aggregate_function` argument: {aggregate_function!r}" raise ValueError(msg) diff --git a/py-polars/polars/dataframe/group_by.py b/py-polars/polars/dataframe/group_by.py index 32668730cace..fa11d5a65946 100644 --- a/py-polars/polars/dataframe/group_by.py +++ b/py-polars/polars/dataframe/group_by.py @@ -305,7 +305,7 @@ def map_groups(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: It is better to implement this with an expression: >>> df.filter( - ... pl.int_range(0, pl.count()).shuffle().over("color") < 2 + ... pl.int_range(pl.len()).shuffle().over("color") < 2 ... ) # doctest: +IGNORE_RESULT """ by: list[str] @@ -452,6 +452,32 @@ def all(self) -> DataFrame: """ return self.agg(F.all()) + def len(self) -> DataFrame: + """ + Return the number of rows in each group. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["apple", "apple", "orange"], + ... "b": [1, None, 2], + ... } + ... ) + >>> df.group_by("a").len() # doctest: +SKIP + shape: (2, 2) + ┌────────┬─────┐ + │ a ┆ len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞════════╪═════╡ + │ apple ┆ 2 │ + │ orange ┆ 1 │ + └────────┴─────┘ + """ + return self.agg(F.len()) + + @deprecate_renamed_function("len", version="0.20.5") def count(self) -> DataFrame: """ Return the number of rows in each group. 
@@ -477,7 +503,7 @@ def count(self) -> DataFrame: │ orange ┆ 1 │ └────────┴───────┘ """ - return self.agg(F.count()) + return self.agg(F.len().alias("count")) def first(self) -> DataFrame: """ diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 5ae3e03a0d80..4c5e0eb2eb0c 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -130,11 +130,11 @@ def output_name(self, *, raise_if_undetermined: bool = True) -> str | None: >>> e_sum_over = pl.sum("foo").over("groups") >>> e_sum_over.meta.output_name() 'foo' - >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice = pl.sum("foo").slice(pl.len() - 10, pl.col("bar")) >>> e_sum_slice.meta.output_name() 'foo' - >>> pl.count().meta.output_name() - 'count' + >>> pl.len().meta.output_name() + 'len' """ try: return self._pyexpr.meta_output_name() @@ -180,7 +180,7 @@ def root_names(self) -> list[str]: >>> e_sum_over = pl.sum("foo").over("groups") >>> e_sum_over.meta.root_names() ['foo', 'groups'] - >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice = pl.sum("foo").slice(pl.len() - 10, pl.col("bar")) >>> e_sum_slice.meta.root_names() ['foo', 'bar'] """ diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index 89ac7434dfb9..22403b09ca77 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -69,6 +69,7 @@ tail, var, ) +from polars.functions.len import len from polars.functions.lit import lit from polars.functions.random import set_random_seed from polars.functions.range import ( @@ -168,6 +169,8 @@ "tail", "time", "var", + # polars.functions.len + "len", # polars.functions.whenthen "when", "sql_expr", diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 4d5ea44eb890..e723ca8c43fb 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -5,7 +5,7 @@ 
import polars._reexport as pl import polars.functions as F -from polars.datatypes import DTYPE_TEMPORAL_UNITS, Date, Datetime, Int64 +from polars.datatypes import DTYPE_TEMPORAL_UNITS, Date, Datetime, Int64, UInt32 from polars.utils._async import _AioDataFrameResult, _GeventDataFrameResult from polars.utils._parse_expr_input import ( parse_as_expression, @@ -15,6 +15,7 @@ from polars.utils.deprecation import ( deprecate_parameter_as_positional, deprecate_renamed_function, + issue_deprecation_warning, ) with contextlib.suppress(ImportError): # Module not available when building docs @@ -90,12 +91,13 @@ def element() -> Expr: @deprecate_parameter_as_positional("column", version="0.20.4") def count(*columns: str) -> Expr: """ - Either return the number of rows in the context, or return the number of non-null values in the column. + Return the number of non-null values in the column. - If no arguments are passed, returns the number of rows in the context; note that rows - containing null values count towards the total (this is similar to `COUNT(*)` in SQL). + This function is syntactic sugar for `col(columns).count()`. - Otherwise, this function is syntactic sugar for `col(column).count()`. + Calling this function without any arguments returns the number of rows in the + context. **This way of using the function is deprecated. Please use :func:`len` + instead.** Parameters ---------- @@ -113,9 +115,6 @@ def count(*columns: str) -> Expr: Examples -------- - Return the number of rows in a context. Note that rows containing null values are - counted towards the total. - >>> df = pl.DataFrame( ... { ... "a": [1, 2, None], @@ -123,18 +122,6 @@ def count(*columns: str) -> Expr: ... "c": ["foo", "bar", "foo"], ... } ... ) - >>> df.select(pl.count()) - shape: (1, 1) - ┌───────┐ - │ count │ - │ --- │ - │ u32 │ - ╞═══════╡ - │ 3 │ - └───────┘ - - Return the number of non-null values in a column. 
- >>> df.select(pl.count("a")) shape: (1, 1) ┌─────┐ @@ -157,38 +144,37 @@ def count(*columns: str) -> Expr: │ 1 ┆ 3 │ └─────┴─────┘ - Generate an index column using `count` in conjunction with :func:`int_range`. - - >>> df.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), - ... pl.all(), - ... ) - shape: (3, 4) - ┌───────┬──────┬──────┬─────┐ - │ index ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 ┆ str │ - ╞═══════╪══════╪══════╪═════╡ - │ 0 ┆ 1 ┆ 3 ┆ foo │ - │ 1 ┆ 2 ┆ null ┆ bar │ - │ 2 ┆ null ┆ null ┆ foo │ - └───────┴──────┴──────┴─────┘ + Return the number of rows in a context. **This way of using the function is + deprecated. Please use :func:`len` instead.** - """ # noqa: W505 + >>> df.select(pl.count()) # doctest: +SKIP + shape: (1, 1) + ┌───────┐ + │ count │ + │ --- │ + │ u32 │ + ╞═══════╡ + │ 3 │ + └───────┘ + """ if not columns: - return wrap_expr(plr.count()) + issue_deprecation_warning( + "`pl.count()` is deprecated. Please use `pl.len()` instead.", + version="0.20.5", + ) + return F.len().alias("count") return F.col(*columns).count() def cum_count(*columns: str, reverse: bool = False) -> Expr: """ - Return the cumulative count of the non-null values in the column or of the context. + Return the cumulative count of the non-null values in the column. + + This function is syntactic sugar for `col(columns).cum_count()`. If no arguments are passed, returns the cumulative count of a context. Rows containing null values count towards the result. - Otherwise, this function is syntactic sugar for `col(names).cum_count()`. - Parameters ---------- *columns @@ -198,24 +184,7 @@ def cum_count(*columns: str, reverse: bool = False) -> Expr: Examples -------- - Return the row numbers of a context. Note that rows containing null values are - counted towards the total. 
- >>> df = pl.DataFrame({"a": [1, 2, None], "b": [3, None, None]}) - >>> df.select(pl.cum_count()) - shape: (3, 1) - ┌───────────┐ - │ cum_count │ - │ --- │ - │ u32 │ - ╞═══════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └───────────┘ - - Return the cumulative count of non-null values in a column. - >>> df.select(pl.cum_count("a")) shape: (3, 1) ┌─────┐ @@ -227,23 +196,18 @@ def cum_count(*columns: str, reverse: bool = False) -> Expr: │ 2 │ │ 2 │ └─────┘ - - Add row numbers to a DataFrame. - - >>> df.select(pl.cum_count().alias("row_number"), pl.all()) - shape: (3, 3) - ┌────────────┬──────┬──────┐ - │ row_number ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════════╪══════╪══════╡ - │ 1 ┆ 1 ┆ 3 │ - │ 2 ┆ 2 ┆ null │ - │ 3 ┆ null ┆ null │ - └────────────┴──────┴──────┘ """ if not columns: - return wrap_expr(plr.cum_count(reverse=reverse)) + issue_deprecation_warning( + "`pl.cum_count()` is deprecated. The same result can be achieved using" + " `pl.int_range(1, pl.len() + 1, dtype=pl.UInt32)`," + " or `int_range(pl.len(), 0, -1, dtype=pl.UInt32)` when `reverse=True`.", + version="0.20.5", + ) + if reverse: + return F.int_range(F.len(), 0, step=-1, dtype=UInt32).alias("cum_count") + else: + return F.int_range(1, F.len() + 1, dtype=UInt32).alias("cum_count") return F.col(*columns).cum_count(reverse=reverse) diff --git a/py-polars/polars/functions/len.py b/py-polars/polars/functions/len.py new file mode 100644 index 000000000000..f34a3e84cbe2 --- /dev/null +++ b/py-polars/polars/functions/len.py @@ -0,0 +1,67 @@ +""" +Module containing the `len` function. + +Keep this function in its own module to avoid conflicts with Python's built-in `len`. 
+""" +from __future__ import annotations + +import contextlib +from typing import TYPE_CHECKING + +from polars.utils._wrap import wrap_expr + +with contextlib.suppress(ImportError): # Module not available when building docs + import polars.polars as plr + +if TYPE_CHECKING: + from polars import Expr + + +def len() -> Expr: + """ + Return the number of rows in the context. + + This is similar to `COUNT(*)` in SQL. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [3, None, None], + ... "c": ["foo", "bar", "foo"], + ... } + ... ) + >>> df.select(pl.len()) + shape: (1, 1) + ┌─────┐ + │ len │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + └─────┘ + + Generate an index column by using `len` in conjunction with :func:`int_range`. + + >>> df.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ) + shape: (3, 4) + ┌───────┬──────┬──────┬─────┐ + │ index ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 ┆ str │ + ╞═══════╪══════╪══════╪═════╡ + │ 0 ┆ 1 ┆ 3 ┆ foo │ + │ 1 ┆ 2 ┆ null ┆ bar │ + │ 2 ┆ null ┆ null ┆ foo │ + └───────┴──────┴──────┴─────┘ + """ + return wrap_expr(plr.len()) diff --git a/py-polars/polars/functions/range/int_range.py b/py-polars/polars/functions/range/int_range.py index 91518fd5f816..96633efe97c2 100644 --- a/py-polars/polars/functions/range/int_range.py +++ b/py-polars/polars/functions/range/int_range.py @@ -198,11 +198,11 @@ def int_range( 2 ] - Generate an index column using `int_range` in conjunction with :func:`count`. + Generate an index column by using `int_range` in conjunction with :func:`len`. >>> df = pl.DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) >>> df.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), ... pl.all(), ... 
) shape: (3, 3) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index b7a93a929809..fccfa642817e 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4616,10 +4616,10 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self: └──────┴─────┴─────┘ An index column can also be created using the expressions :func:`int_range` - and :func:`count`. + and :func:`len`. >>> lf.select( - ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), ... pl.all(), ... ).collect() shape: (3, 3) diff --git a/py-polars/polars/lazyframe/group_by.py b/py-polars/polars/lazyframe/group_by.py index 21ae284fef43..b8e3aa588c7c 100644 --- a/py-polars/polars/lazyframe/group_by.py +++ b/py-polars/polars/lazyframe/group_by.py @@ -208,11 +208,9 @@ def map_groups( It is better to implement this with an expression: - >>> ( - ... df.lazy() - ... .filter(pl.int_range(0, pl.count()).shuffle().over("color") < 2) - ... .collect() - ... ) # doctest: +IGNORE_RESULT + >>> df.lazy().filter( + ... pl.int_range(pl.len()).shuffle().over("color") < 2 + ... ).collect() # doctest: +IGNORE_RESULT """ return wrap_ldf(self.lgb.map_groups(function, schema)) @@ -335,6 +333,34 @@ def all(self) -> LazyFrame: """ return self.agg(F.all()) + def len(self) -> LazyFrame: + """ + Return the number of rows in each group. + + Rows containing null values count towards the total. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["apple", "apple", "orange"], + ... "b": [1, None, 2], + ... } + ... 
) + >>> lf.group_by("a").len().collect() # doctest: +SKIP + shape: (2, 2) + ┌────────┬─────┐ + │ a ┆ len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞════════╪═════╡ + │ apple ┆ 2 │ + │ orange ┆ 1 │ + └────────┴─────┘ + """ + return self.agg(F.len()) + + @deprecate_renamed_function("len", version="0.20.5") def count(self) -> LazyFrame: """ Return the number of rows in each group. @@ -360,7 +386,7 @@ │ orange ┆ 1 │ └────────┴───────┘ """ - return self.agg(F.count()) + return self.agg(F.len().alias("count")) def first(self) -> LazyFrame: """ diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 7570718192de..4e00664d42c7 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -100,7 +100,7 @@ "lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd" ] PivotAgg: TypeAlias = Literal[ - "first", "sum", "max", "min", "mean", "median", "last", "count" + "min", "max", "first", "last", "sum", "mean", "median", "len" ] RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"] SizeUnit: TypeAlias = Literal[ diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs index efcfea999bf6..bedcf6739cc7 100644 --- a/py-polars/src/functions/lazy.rs +++ b/py-polars/src/functions/lazy.rs @@ -183,13 +183,8 @@ pub fn concat_str(s: Vec, separator: &str) -> PyExpr { } #[pyfunction] -pub fn count() -> PyExpr { - dsl::count().into() -} - -#[pyfunction] -pub fn cum_count(reverse: bool) -> PyExpr { - dsl::cum_count(reverse).into() +pub fn len() -> PyExpr { + dsl::len().into() } #[pyfunction] diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 60f68283e418..86b88ff65a31 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -148,10 +148,8 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::concat_str)) .unwrap(); - 
m.add_wrapped(wrap_pyfunction!(functions::count)).unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::len)).unwrap(); m.add_wrapped(wrap_pyfunction!(functions::cov)).unwrap(); - m.add_wrapped(wrap_pyfunction!(functions::cum_count)) - .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::cum_fold)) .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::cum_reduce)) diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 039ad3013a51..2b6d965734da 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1774,9 +1774,9 @@ def __repr__(self) -> str: def test_group_by_order_dispatch() -> None: df = pl.DataFrame({"x": list("bab"), "y": range(3)}) - result = df.group_by("x", maintain_order=True).count() + result = df.group_by("x", maintain_order=True).len() expected = pl.DataFrame( - {"x": ["b", "a"], "count": [2, 1]}, schema_overrides={"count": pl.UInt32} + {"x": ["b", "a"], "len": [2, 1]}, schema_overrides={"len": pl.UInt32} ) assert_frame_equal(result, expected) @@ -2409,7 +2409,7 @@ def test_group_by_slice_expression_args() -> None: out = ( df.group_by("groups", maintain_order=True) - .agg([pl.col("vals").slice(pl.count() * 0.1, (pl.count() // 5))]) + .agg([pl.col("vals").slice(pl.len() * 0.1, (pl.len() // 5))]) .explode("vals") ) diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index f61d708b9d58..07f7a2026305 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -124,7 +124,7 @@ def test_unset_sorted_on_append() -> None: ] ).sort("key") df = pl.concat([df1, df2], rechunk=False) - assert df.group_by("key").count()["count"].to_list() == [4, 4] + assert df.group_by("key").len()["len"].to_list() == [4, 4] @pytest.mark.parametrize( diff --git a/py-polars/tests/unit/datatypes/test_temporal.py 
b/py-polars/tests/unit/datatypes/test_temporal.py index 6e8d609e8745..c50a31117964 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1311,13 +1311,13 @@ def test_rolling_by_() -> None: out = ( df.sort("datetime") .rolling(index_column="datetime", by="group", period=timedelta(days=3)) - .agg([pl.count().alias("count")]) + .agg([pl.len().alias("count")]) ) expected = ( df.sort(["group", "datetime"]) .rolling(index_column="datetime", by="group", period="3d") - .agg([pl.count().alias("count")]) + .agg([pl.len().alias("count")]) ) assert_frame_equal(out.sort(["group", "datetime"]), expected) assert out.to_dict(as_series=False) == { @@ -2574,30 +2574,18 @@ def test_datetime_cum_agg_schema() -> None: def test_rolling_group_by_empty_groups_by_take_6330() -> None: - df = ( - pl.DataFrame({"Event": ["Rain", "Sun"]}) - .join( - pl.DataFrame( - { - "Date": [1, 2, 3, 4], - } - ), - how="cross", - ) - .set_sorted("Date") - ) - assert ( - df.rolling( - index_column="Date", - period="2i", - offset="-2i", - by="Event", - closed="left", - ).agg([pl.count()]) - ).to_dict(as_series=False) == { + df1 = pl.DataFrame({"Event": ["Rain", "Sun"]}) + df2 = pl.DataFrame({"Date": [1, 2, 3, 4]}) + df = df1.join(df2, how="cross").set_sorted("Date") + + result = df.rolling( + index_column="Date", period="2i", offset="-2i", by="Event", closed="left" + ).agg(pl.len()) + + assert result.to_dict(as_series=False) == { "Event": ["Rain", "Rain", "Rain", "Rain", "Sun", "Sun", "Sun", "Sun"], "Date": [1, 2, 3, 4, 1, 2, 3, 4], - "count": [0, 1, 2, 2, 0, 1, 2, 2], + "len": [0, 1, 2, 2, 0, 1, 2, 2], } diff --git a/py-polars/tests/unit/expr/test_exprs.py b/py-polars/tests/unit/expr/test_exprs.py index 4fa9008e5b20..25c9ce9df4c2 100644 --- a/py-polars/tests/unit/expr/test_exprs.py +++ b/py-polars/tests/unit/expr/test_exprs.py @@ -100,16 +100,16 @@ def test_filter_where() -> None: ] -def test_count_expr() -> None: +def test_len_expr() -> None: 
df = pl.DataFrame({"a": [1, 2, 3, 3, 3], "b": ["a", "a", "b", "a", "a"]}) - out = df.select(pl.count()) + out = df.select(pl.len()) assert out.shape == (1, 1) assert cast(int, out.item()) == 5 - out = df.group_by("b", maintain_order=True).agg(pl.count()) + out = df.group_by("b", maintain_order=True).agg(pl.len()) assert out["b"].to_list() == ["a", "b"] - assert out["count"].to_list() == [4, 1] + assert out["len"].to_list() == [4, 1] def test_map_alias() -> None: @@ -678,7 +678,7 @@ def test_head() -> None: assert df.select(pl.col("a").head(10)).to_dict(as_series=False) == { "a": [1, 2, 3, 4, 5] } - assert df.select(pl.col("a").head(pl.count() / 2)).to_dict(as_series=False) == { + assert df.select(pl.col("a").head(pl.len() / 2)).to_dict(as_series=False) == { "a": [1, 2] } @@ -690,7 +690,7 @@ def test_tail() -> None: assert df.select(pl.col("a").tail(10)).to_dict(as_series=False) == { "a": [1, 2, 3, 4, 5] } - assert df.select(pl.col("a").tail(pl.count() / 2)).to_dict(as_series=False) == { + assert df.select(pl.col("a").tail(pl.len() / 2)).to_dict(as_series=False) == { "a": [4, 5] } diff --git a/py-polars/tests/unit/functions/test_cum_count.py b/py-polars/tests/unit/functions/test_cum_count.py index 3850624af00a..bbedad60d598 100644 --- a/py-polars/tests/unit/functions/test_cum_count.py +++ b/py-polars/tests/unit/functions/test_cum_count.py @@ -9,7 +9,8 @@ @pytest.mark.parametrize(("reverse", "output"), [(False, [1, 2, 3]), (True, [3, 2, 1])]) def test_cum_count_no_args(reverse: bool, output: list[int]) -> None: df = pl.DataFrame({"a": [5, 5, None]}) - result = df.select(pl.cum_count(reverse=reverse)) + with pytest.deprecated_call(): + result = df.select(pl.cum_count(reverse=reverse)) expected = pl.Series("cum_count", output, dtype=pl.UInt32).to_frame() assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index f2f6d6556d7a..ed4967b418ec 100644 --- 
a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -1047,11 +1047,13 @@ def test_to_init_repr() -> None: def test_untrusted_categorical_input() -> None: - df = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) - assert pl.from_pandas(df).group_by("x").count().to_dict(as_series=False) == { - "x": ["x"], - "count": [1], - } + df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) + df = pl.from_pandas(df_pd) + result = df.group_by("x").len() + expected = pl.DataFrame( + {"x": ["x"], "len": [1]}, schema={"x": pl.Categorical, "len": pl.UInt32} + ) + assert_frame_equal(result, expected, categorical_as_str=True) def test_sliced_struct_from_arrow() -> None: diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index f80e5f44d0da..22e57462ae49 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -252,10 +252,10 @@ def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: Path) -> foods_file_path, dtypes={"calories": pl.String, "sugars_g": pl.Int8}, ) - .select(pl.count()) + .select(pl.len()) .collect() ) - expected = pl.DataFrame({"count": 27}, schema={"count": pl.UInt32}) + expected = pl.DataFrame({"len": 27}, schema={"len": pl.UInt32}) assert_frame_equal(df, expected) diff --git a/py-polars/tests/unit/io/test_pickle.py b/py-polars/tests/unit/io/test_pickle.py index 5e307228a67a..57cd6d954b9d 100644 --- a/py-polars/tests/unit/io/test_pickle.py +++ b/py-polars/tests/unit/io/test_pickle.py @@ -19,7 +19,7 @@ def test_pickle() -> None: def test_pickle_expr() -> None: - for e in [pl.all(), pl.count()]: + for e in [pl.all(), pl.len()]: f = io.BytesIO() pickle.dump(e, f) diff --git a/py-polars/tests/unit/namespaces/test_meta.py b/py-polars/tests/unit/namespaces/test_meta.py index 93916daa3fa3..fe554c694491 100644 --- a/py-polars/tests/unit/namespaces/test_meta.py +++ 
b/py-polars/tests/unit/namespaces/test_meta.py @@ -34,12 +34,12 @@ def test_root_and_output_names() -> None: assert e.meta.output_name() == "foo" assert e.meta.root_names() == ["foo", "groups"] - e = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + e = pl.sum("foo").slice(pl.len() - 10, pl.col("bar")) assert e.meta.output_name() == "foo" assert e.meta.root_names() == ["foo", "bar"] - e = pl.count() - assert e.meta.output_name() == "count" + e = pl.len() + assert e.meta.output_name() == "len" with pytest.raises( pl.ComputeError, diff --git a/py-polars/tests/unit/operations/rolling/test_rolling.py b/py-polars/tests/unit/operations/rolling/test_rolling.py index bad3d307c94e..e30cc160f505 100644 --- a/py-polars/tests/unit/operations/rolling/test_rolling.py +++ b/py-polars/tests/unit/operations/rolling/test_rolling.py @@ -230,7 +230,7 @@ def test_rolling_extrema() -> None: ) ).with_columns( [ - pl.when(pl.int_range(0, pl.count(), eager=False) < 2) + pl.when(pl.int_range(0, pl.len(), eager=False) < 2) .then(None) .otherwise(pl.all()) .name.suffix("_nulls") @@ -815,7 +815,7 @@ def test_index_expr_with_literal() -> None: def test_index_expr_output_name_12244() -> None: df = pl.DataFrame({"A": [1, 2, 3]}) - out = df.rolling(pl.int_range(0, pl.count()), period="2i").agg("A") + out = df.rolling(pl.int_range(0, pl.len()), period="2i").agg("A") assert out.to_dict(as_series=False) == { "literal": [0, 1, 2], "A": [[1], [1, 2], [2, 3]], diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py index 3ade166f7422..533eadd37339 100644 --- a/py-polars/tests/unit/operations/test_filter.py +++ b/py-polars/tests/unit/operations/test_filter.py @@ -131,7 +131,7 @@ def test_predicate_order_explode_5950() -> None: assert ( df.lazy() .explode("i") - .filter(pl.count().over(["i"]) == 2) + .filter(pl.len().over(["i"]) == 2) .filter(pl.col("n").is_not_null()) ).collect().to_dict(as_series=False) == {"i": [1], "n": [0]} @@ -184,8 +184,8 @@ 
def test_clear_window_cache_after_filter_10499() -> None: } ) - assert df.lazy().filter((pl.col("a").null_count() < pl.count()).over("b")).filter( - ((pl.col("a") == 0).sum() < pl.count()).over("b") + assert df.lazy().filter((pl.col("a").null_count() < pl.len()).over("b")).filter( + ((pl.col("a") == 0).sum() < pl.len()).over("b") ).collect().to_dict(as_series=False) == { "a": [3, None, 5, 0, 9, 10], "b": [2, 2, 3, 3, 5, 5], diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index cebbbb106ab3..0b334568b073 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -250,7 +250,7 @@ def df() -> pl.DataFrame: ("method", "expected"), [ ("all", [("a", [1, 2], [None, 1]), ("b", [3, 4, 5], [None, 1, None])]), - ("count", [("a", 2), ("b", 3)]), + ("len", [("a", 2), ("b", 3)]), ("first", [("a", 1, None), ("b", 3, None)]), ("last", [("a", 2, 1), ("b", 5, None)]), ("max", [("a", 2, 1), ("b", 5, 1)]), @@ -763,7 +763,7 @@ def test_perfect_hash_table_null_values() -> None: def test_group_by_partitioned_ending_cast(monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_FORCE_PARTITION", "1") df = pl.DataFrame({"a": [1] * 5, "b": [1] * 5}) - out = df.group_by(["a", "b"]).agg(pl.count().cast(pl.Int64).alias("num")) + out = df.group_by(["a", "b"]).agg(pl.len().cast(pl.Int64).alias("num")) expected = pl.DataFrame({"a": [1], "b": [1], "num": [5]}) assert_frame_equal(out, expected) @@ -890,8 +890,8 @@ def test_group_by_with_expr_as_key() -> None: def test_lazy_group_by_reuse_11767() -> None: lgb = pl.select(x=1).lazy().group_by("x") - a = lgb.count() - b = lgb.count() + a = lgb.len() + b = lgb.len() assert_frame_equal(a, b) diff --git a/py-polars/tests/unit/operations/test_group_by_dynamic.py b/py-polars/tests/unit/operations/test_group_by_dynamic.py index 1f6799dd8005..9404b22ea52a 100644 --- a/py-polars/tests/unit/operations/test_group_by_dynamic.py +++ 
b/py-polars/tests/unit/operations/test_group_by_dynamic.py @@ -113,7 +113,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: include_boundaries=True, label="datapoint", start_by="datapoint", - ).agg(pl.count()).to_dict(as_series=False) == { + ).agg(pl.len()).to_dict(as_series=False) == { "_lower_boundary": [ datetime(2022, 12, 16, 0, 0, tzinfo=tzinfo), datetime(2022, 12, 16, 0, 31, tzinfo=tzinfo), @@ -138,7 +138,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: datetime(2022, 12, 16, 2, 30, tzinfo=tzinfo), datetime(2022, 12, 16, 3, 0, tzinfo=tzinfo), ], - "count": [2, 1, 1, 1, 1, 1], + "len": [2, 1, 1, 1, 1, 1], } # start by monday @@ -156,7 +156,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: include_boundaries=True, start_by="monday", label="datapoint", - ).agg([pl.count(), pl.col("day").first().alias("data_day")]) + ).agg([pl.len(), pl.col("day").first().alias("data_day")]) assert result.to_dict(as_series=False) == { "_lower_boundary": [ datetime(2022, 1, 3, 0, 0, tzinfo=tzinfo), @@ -170,7 +170,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: datetime(2022, 1, 3, 0, 0, tzinfo=tzinfo), datetime(2022, 1, 10, 0, 0, tzinfo=tzinfo), ], - "count": [6, 5], + "len": [6, 5], "data_day": [1, 1], } # start by saturday @@ -181,7 +181,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: include_boundaries=True, start_by="saturday", label="datapoint", - ).agg([pl.count(), pl.col("day").first().alias("data_day")]) + ).agg([pl.len(), pl.col("day").first().alias("data_day")]) assert result.to_dict(as_series=False) == { "_lower_boundary": [ datetime(2022, 1, 1, 0, 0, tzinfo=tzinfo), @@ -195,7 +195,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: datetime(2022, 1, 1, 0, 0, tzinfo=tzinfo), datetime(2022, 1, 8, 0, 0, tzinfo=tzinfo), ], - "count": [6, 6], + "len": [6, 6], "data_day": [6, 6], } diff --git 
a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index 4f606f99b6e3..097a9f93a453 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -56,7 +56,7 @@ def test_pivot_list() -> None: ("agg_fn", "expected_rows"), [ ("first", [("a", 2, None, None), ("b", None, None, 10)]), - ("count", [("a", 2, None, None), ("b", None, 2, 1)]), + ("len", [("a", 2, None, None), ("b", None, 2, 1)]), ("min", [("a", 2, None, None), ("b", None, 8, 10)]), ("max", [("a", 4, None, None), ("b", None, 8, 10)]), ("sum", [("a", 6, None, None), ("b", None, 8, 10)]), @@ -106,14 +106,12 @@ def test_pivot_categorical_index() -> None: schema=[("A", pl.Categorical), ("B", pl.Categorical)], ) - result = df.pivot(values="B", index=["A"], columns="B", aggregate_function="count") + result = df.pivot(values="B", index=["A"], columns="B", aggregate_function="len") expected = {"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, None]} assert result.to_dict(as_series=False) == expected # test expression dispatch - result = df.pivot( - values="B", index=["A"], columns="B", aggregate_function=pl.count() - ) + result = df.pivot(values="B", index=["A"], columns="B", aggregate_function=pl.len()) assert result.to_dict(as_series=False) == expected df = pl.DataFrame( @@ -125,7 +123,7 @@ def test_pivot_categorical_index() -> None: schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)], ) result = df.pivot( - values="B", index=["A", "C"], columns="B", aggregate_function="count" + values="B", index=["A", "C"], columns="B", aggregate_function="len" ) expected = { "A": ["Fire", "Water"], diff --git a/py-polars/tests/unit/operations/test_random.py b/py-polars/tests/unit/operations/test_random.py index 328373a65f44..71195f46d239 100644 --- a/py-polars/tests/unit/operations/test_random.py +++ b/py-polars/tests/unit/operations/test_random.py @@ -14,7 +14,7 @@ def unique_shuffle_groups(n: int, seed: 
int | None) -> int: shuffled = df.group_by("group", maintain_order=True).agg( pl.col("l").shuffle(seed) ) - num_unique = shuffled.group_by("l").agg(pl.lit(0)).select(pl.count()) + num_unique = shuffled.group_by("l").agg(pl.lit(0)).select(pl.len()) return int(num_unique[0, 0]) assert unique_shuffle_groups(50, None) > 1 # Astronomically unlikely. diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index c9c4ff5e95c8..ddde2576462e 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -57,9 +57,10 @@ def test_rolling_negative_offset_3914() -> None: ), } ) - assert df.rolling(index_column="datetime", period="2d", offset="-4d").agg( - pl.count().alias("count") - )["count"].to_list() == [0, 0, 1, 2, 2] + result = df.rolling(index_column="datetime", period="2d", offset="-4d").agg( + pl.len() + ) + assert result["len"].to_list() == [0, 0, 1, 2, 2] df = pl.DataFrame( { diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index ce4c3dd8ceff..0e23df2dc015 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -118,7 +118,7 @@ def test_window_function_cache() -> None: def test_window_range_no_rows() -> None: df = pl.DataFrame({"x": [5, 5, 4, 4, 2, 2]}) - expr = pl.int_range(0, pl.count()).over("x") + expr = pl.int_range(0, pl.len()).over("x") out = df.with_columns(int=expr) assert_frame_equal( out, pl.DataFrame({"x": [5, 5, 4, 4, 2, 2], "int": [0, 1, 0, 1, 0, 1]}) @@ -193,14 +193,14 @@ def test_cumulative_eval_window_functions() -> None: assert_frame_equal(result, expected) -def test_count_window() -> None: +def test_len_window() -> None: assert ( pl.DataFrame( { "a": [1, 1, 2], } ) - .with_columns(pl.count().over("a"))["count"] + .with_columns(pl.len().over("a"))["len"] .to_list() ) == [2, 2, 1] diff --git 
a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index 2d318874aace..fd18289fdc86 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -232,12 +232,12 @@ def test_streaming_9776() -> None: df = pl.DataFrame({"col_1": ["a"] * 1000, "ID": [None] + ["a"] * 999}) ordered = ( df.group_by("col_1", "ID", maintain_order=True) - .count() + .len() .filter(pl.col("col_1") == "a") ) unordered = ( df.group_by("col_1", "ID", maintain_order=False) - .count() + .len() .filter(pl.col("col_1") == "a") ) expected = [("a", None, 1), ("a", "a", 999)] diff --git a/py-polars/tests/unit/streaming/test_streaming_group_by.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py index e4ad5117b5b9..35715f18179c 100644 --- a/py-polars/tests/unit/streaming/test_streaming_group_by.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -26,12 +26,12 @@ def test_streaming_group_by_sorted_fast_path_nulls_10273() -> None: df.set_sorted("x") .lazy() .group_by("x") - .agg(pl.count()) + .agg(pl.len()) .collect(streaming=True) .sort("x") ).to_dict(as_series=False) == { "x": [None, 0, 1, 2, 3], - "count": [100, 100, 100, 100, 100], + "len": [100, 100, 100, 100, 100], } @@ -147,18 +147,14 @@ def test_streaming_group_by_min_max() -> None: def test_streaming_non_streaming_gb() -> None: n = 100 df = pl.DataFrame({"a": np.random.randint(0, 20, n)}) - q = df.lazy().group_by("a").agg(pl.count()).sort("a") + q = df.lazy().group_by("a").agg(pl.len()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").cast(pl.String)) - q = q.group_by("a").agg(pl.count()).sort("a") + q = q.group_by("a").agg(pl.len()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").alias("b")) - q = ( - q.group_by(["a", "b"]) - .agg(pl.count(), pl.col("a").sum().alias("sum_a")) - .sort("a") - ) 
+ q = q.group_by(["a", "b"]).agg(pl.len(), pl.col("a").sum().alias("sum_a")).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) @@ -289,11 +285,11 @@ def test_streaming_group_by_struct_key() -> None: {"A": [1, 2, 3, 2], "B": ["google", "ms", "apple", "ms"], "C": [2, 3, 4, 3]} ) df1 = df.lazy().with_columns(pl.struct(["A", "C"]).alias("tuples")) - assert df1.group_by("tuples").agg(pl.count(), pl.col("B").first()).sort( - "B" - ).collect(streaming=True).to_dict(as_series=False) == { + assert df1.group_by("tuples").agg(pl.len(), pl.col("B").first()).sort("B").collect( + streaming=True + ).to_dict(as_series=False) == { "tuples": [{"A": 3, "C": 4}, {"A": 1, "C": 2}, {"A": 2, "C": 3}], - "count": [1, 1, 2], + "len": [1, 1, 2], "B": ["apple", "google", "ms"], } diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index fdb1ef67a0db..b9fac236edba 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -469,7 +469,7 @@ def test_cse_count_in_group_by() -> None: q = ( pl.LazyFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [40, 51, 12]}) .group_by("a") - .agg(pl.all().slice(0, pl.count() - 1)) + .agg(pl.all().slice(0, pl.len() - 1)) ) assert "POLARS_CSER" not in q.explain() @@ -527,8 +527,8 @@ def test_cse_slice_11594() -> None: df = pl.LazyFrame({"a": [1, 2, 1, 2, 1, 2]}) q = df.select( - pl.col("a").slice(offset=1, length=pl.count() - 1).alias("1"), - pl.col("a").slice(offset=1, length=pl.count() - 1).alias("2"), + pl.col("a").slice(offset=1, length=pl.len() - 1).alias("1"), + pl.col("a").slice(offset=1, length=pl.len() - 1).alias("2"), ) assert "__POLARS_CSE" in q.explain(comm_subexpr_elim=True) @@ -539,8 +539,8 @@ def test_cse_slice_11594() -> None: } q = df.select( - pl.col("a").slice(offset=1, length=pl.count() - 1).alias("1"), - pl.col("a").slice(offset=0, length=pl.count() - 1).alias("2"), + pl.col("a").slice(offset=1, length=pl.len() - 1).alias("1"), + pl.col("a").slice(offset=0, length=pl.len() 
- 1).alias("2"), ) assert "__POLARS_CSE" in q.explain(comm_subexpr_elim=True) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index 34530d10772c..dda6fe590737 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -20,7 +20,7 @@ def test_error_on_empty_group_by() -> None: with pytest.raises( pl.ComputeError, match="at least one key is required in a group_by operation" ): - pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.count()) + pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.len()) def test_error_on_reducing_map() -> None: diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index a6957ca18c88..85330e1d4f17 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -146,11 +146,11 @@ def test_count_suffix_10783() -> None: } ) df_with_cnt = df.with_columns( - pl.count() + pl.len() .over(pl.col("a").list.sort().list.join("").hash()) .name.suffix("_suffix") ) - df_expect = df.with_columns(pl.Series("count_suffix", [3, 3, 1, 3])) + df_expect = df.with_columns(pl.Series("len_suffix", [3, 3, 1, 3])) assert_frame_equal(df_with_cnt, df_expect, check_dtype=False) @@ -1197,7 +1197,7 @@ def test_predicate_count_vstack() -> None: "v": [5, 7], } ) - assert pl.concat([l1, l2]).filter(pl.count().over("k") == 2).collect()[ + assert pl.concat([l1, l2]).filter(pl.len().over("k") == 2).collect()[ "v" ].to_list() == [3, 2, 5, 7] diff --git a/py-polars/tests/unit/test_predicates.py b/py-polars/tests/unit/test_predicates.py index c6c0147e3fd0..811b5c82c32f 100644 --- a/py-polars/tests/unit/test_predicates.py +++ b/py-polars/tests/unit/test_predicates.py @@ -203,7 +203,7 @@ def test_predicate_pushdown_group_by_keys() -> None: assert ( 'SELECTION: "None"' not in df.group_by("group") - .agg([pl.count().alias("str_list")]) + .agg([pl.len().alias("str_list")]) .filter(pl.col("group") == 1) .explain() ) @@ -388,16 +388,16 @@ def 
test_predicate_pushdown_with_window_projections_12637() -> None: # that only refers to the common window keys. actual = lf.with_columns( (pl.col("value") * 2).over("key").alias("value_2"), - ).filter(pl.count().over("key") == 1) + ).filter(pl.len().over("key") == 1) plan = actual.explain() - assert r'FILTER [(count().over([col("key")])) == (1)]' in plan + assert r'FILTER [(len().over([col("key")])) == (1)]' in plan assert 'SELECTION: "None"' in plan # Test window in filter - actual = lf.filter(pl.count().over("key") == 1).filter(pl.col("key") == 1) + actual = lf.filter(pl.len().over("key") == 1).filter(pl.col("key") == 1) plan = actual.explain() - assert r'FILTER [(count().over([col("key")])) == (1)]' in plan + assert r'FILTER [(len().over([col("key")])) == (1)]' in plan assert r'SELECTION: "[(col(\"key\")) == (1)]"' in plan diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index 199c5b6f2659..0a8c180806a1 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -275,18 +275,22 @@ def test_merge_sorted_projection_pd() -> None: def test_distinct_projection_pd_7578() -> None: - df = pl.DataFrame( + lf = pl.LazyFrame( { "foo": ["0", "1", "2", "1", "2"], "bar": ["a", "a", "a", "b", "b"], } ) - q = df.lazy().unique().group_by("bar").agg(pl.count()) - assert q.collect().sort("bar").to_dict(as_series=False) == { - "bar": ["a", "b"], - "count": [3, 2], - } + result = lf.unique().group_by("bar").agg(pl.len()) + expected = pl.LazyFrame( + { + "bar": ["a", "b"], + "len": [3, 2], + }, + schema_overrides={"len": pl.UInt32}, + ) + assert_frame_equal(result, expected, check_row_order=False) def test_join_suffix_collision_9562() -> None: @@ -351,7 +355,7 @@ def test_projection_rename_10595() -> None: def test_projection_count_11841() -> None: - pl.LazyFrame({"x": 1}).select(records=pl.count()).select( + pl.LazyFrame({"x": 1}).select(records=pl.len()).select( pl.lit(1).alias("x"), pl.all() 
).collect() diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 08edd662a7d4..1a28b608ae06 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -34,7 +34,7 @@ def test_repeat_expansion_in_group_by() -> None: out = ( pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]}) .group_by("g", maintain_order=True) - .agg(pl.repeat(1, pl.count()).cum_sum()) + .agg(pl.repeat(1, pl.len()).cum_sum()) .to_dict(as_series=False) ) assert out == {"g": [1, 2, 3], "repeat": [[1], [1, 2], [1, 2, 3]]} @@ -126,10 +126,10 @@ def test_sorted_group_by_optimization(monkeypatch: Any) -> None: sorted_implicit = ( df.with_columns(pl.col("a").sort(descending=descending)) .group_by("a") - .agg(pl.count()) + .agg(pl.len()) ) sorted_explicit = ( - df.group_by("a").agg(pl.count()).sort("a", descending=descending) + df.group_by("a").agg(pl.len()).sort("a", descending=descending) ) assert_frame_equal(sorted_explicit, sorted_implicit) @@ -258,7 +258,7 @@ def map_expr(name: str) -> pl.Expr: pl.struct( [ pl.sum(name).alias("sum"), - (pl.count() - pl.col(name).null_count()).alias("count"), + (pl.len() - pl.col(name).null_count()).alias("count"), ] ), ) diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 673c3fa86dcf..f5eb9a8e4b57 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -150,8 +150,7 @@ def test_bool_numeric_supertype() -> None: pl.Int64, ]: assert ( - df.select([(pl.col("v") < 3).sum().cast(dt) / pl.count()]).item() - - 0.3333333 + df.select([(pl.col("v") < 3).sum().cast(dt) / pl.len()]).item() - 0.3333333 <= 0.00001 ) @@ -631,5 +630,5 @@ def test_literal_subtract_schema_13284() -> None: pl.LazyFrame({"a": [23, 30]}, schema={"a": pl.UInt8}) .with_columns(pl.col("a") - pl.lit(1)) .group_by(by="a") - .count() - ).schema == OrderedDict([("a", pl.UInt8), ("count", pl.UInt32)]) + .len() + ).schema == OrderedDict([("a", pl.UInt8), ("len", 
pl.UInt32)])