-
Notifications
You must be signed in to change notification settings - Fork 121
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add DataFrame.with_row_count/2 #833
Conversation
lib/explorer/data_frame.ex
Outdated
""" | ||
@doc type: :single | ||
@spec with_row_count(df :: DataFrame.t(), opts :: Keyword.t()) :: DataFrame.t() | ||
def with_row_count(%DataFrame{} = df, opts \\ []) do |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if this should be with_row_count(%DataFrame{} = df, name, index)
test/explorer/data_frame_test.exs
Outdated
@@ -4234,4 +4234,42 @@ defmodule Explorer.DataFrameTest do | |||
} | |||
end | |||
end | |||
|
|||
describe "with_row_count/2" do |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems redundant with doctests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that with_row_index is better, but we can use this name in Explorer and fallback to with_row_count in Polars. Although I’d like @cigrainger’s opinion on the name. We really don’t have functions starting with “with_”.
85b1ed8
to
9b86a4f
Compare
I'm open to other names, but I'm personally good with |
I'm not a huge fan of |
This convinced me on going with However...
Ideally we would implement it with the above, and then |
9b86a4f
to
61cde22
Compare
Interesting. So what if we:
Would that work? Perhaps we try that as a separate PR so we can compare them side by side? |
I like that idea a lot @josevalim |
I'll make a proper PR later. This works, but I don't know if it's the correct implementation. I need some guidance for
diffcommit 3d2a8bda28f8072e081b2fb248f72bad0bb76a34
Author: Iuri Mateus <20242530+iurimateus@users.noreply.github.com>
Date: Tue Jan 23 09:17:16 2024 -0300
wip
diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex
index e6461a6..e99204a 100644
--- a/lib/explorer/backend/lazy_series.ex
+++ b/lib/explorer/backend/lazy_series.ex
@@ -141,7 +141,9 @@ defmodule Explorer.Backend.LazySeries do
# List functions
join: 2,
lengths: 1,
- member: 3
+ member: 3,
+ # ?
+ len: 0
]
@comparison_operations [:equal, :not_equal, :greater, :greater_equal, :less, :less_equal]
@@ -1074,6 +1076,47 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, :boolean)
end
+ def len() do
+ data = new(:len, [], {:u, 32})
+
+ Backend.Series.new(data, {:u, 32})
+ end
+
+ def row_index() do
+ data = new(:row_index, [], {:u, 32})
+
+ Backend.Series.new(data, {:u, 32})
+ end
+
+ def int_range(start, end_, step \\ 1) do
+ # TODO check boundaries?
+
+ # ** (RuntimeError) Polars Error: lengths don't match: unable to add a column of length 2 to
+ # a DataFrame of height 3
+
+ # TODO Should we also accept a non lazy Series?
+ {start, end_} =
+ case {start, end_} do
+ {%Series{}, %Series{}} ->
+ {lazy_series!(start), lazy_series!(end_)}
+
+ {%Series{}, _} ->
+ {lazy_series!(start), end_}
+
+ {_, %Series{}} ->
+ {start, lazy_series!(end_)}
+
+ {v1, v2} when is_integer(v1) and is_integer(v2) ->
+ {v1, v2}
+
+ {_, _} ->
+ raise ArgumentError, "invalid arguments"
+ end
+
+ data = new(:int_range, [start, end_, step], {:s, 64})
+ Backend.Series.new(data, {:s, 64})
+ end
+
@remaining_non_lazy_operations [
at: 2,
at_every: 2,
diff --git a/lib/explorer/polars_backend/expression.ex b/lib/explorer/polars_backend/expression.ex
index e2581a8..1dd50b2 100644
--- a/lib/explorer/polars_backend/expression.ex
+++ b/lib/explorer/polars_backend/expression.ex
@@ -155,6 +155,9 @@ defmodule Explorer.PolarsBackend.Expression do
shift: 3,
slice: 2,
slice: 3,
+ row_index: 0,
+ len: 0,
+ int_range: 4,
concat: 1,
column: 1,
correlation: 4,
@@ -290,6 +293,22 @@ defmodule Explorer.PolarsBackend.Expression do
end
end
+ def to_expr(%LazySeries{op: :len, args: []}) do
+ Native.expr_len()
+ end
+
+ def to_expr(%LazySeries{op: :row_index, args: []}) do
+ Native.expr_int_range(to_expr(0), Native.expr_len(), 1, {:u, 32})
+ end
+
+ def to_expr(%LazySeries{
+ op: :int_range,
+ args: [start, end_, step],
+ dtype: dtype
+ }) do
+ Native.expr_int_range(to_expr(start), to_expr(end_), step, dtype)
+ end
+
for {op, arity} <- @all_expressions do
args = Macro.generate_arguments(arity, __MODULE__)
diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
index 8557778..1f4b8c6 100644
--- a/lib/explorer/polars_backend/native.ex
+++ b/lib/explorer/polars_backend/native.ex
@@ -213,6 +213,9 @@ defmodule Explorer.PolarsBackend.Native do
def expr_series(_series), do: err()
def expr_string(_string), do: err()
+ # def expr_len(), do: err()
+ def expr_int_range(_start, _end, _step, _dtype), do: err()
+
# LazyFrame
def lf_collect(_df), do: err()
def lf_describe_plan(_df, _optimized), do: err()
diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex
index ca9bc66..c5b3121 100644
--- a/lib/explorer/series.ex
+++ b/lib/explorer/series.ex
@@ -5985,6 +5985,18 @@ defmodule Explorer.Series do
def member?(%Series{dtype: dtype}, _value),
do: dtype_error("member?/2", dtype, [{:list, :_}])
+ def len() do
+ Explorer.Backend.LazySeries.len()
+ end
+
+ def row_index() do
+ Explorer.Backend.LazySeries.row_index()
+ end
+
+ def int_range(start, end_, step \\ 1) do
+ Explorer.Backend.LazySeries.int_range(start, end_, step)
+ end
+
# Escape hatch
@doc """
diff --git a/native/explorer/Cargo.toml b/native/explorer/Cargo.toml
index 96fbc89..8765393 100644
--- a/native/explorer/Cargo.toml
+++ b/native/explorer/Cargo.toml
@@ -77,6 +77,7 @@ features = [
"product",
"peaks",
"moment",
+ "range",
"rank",
"propagate_nans",
]
diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs
index b433d8c..c731fe0 100644
--- a/native/explorer/src/expressions.rs
+++ b/native/explorer/src/expressions.rs
@@ -4,6 +4,7 @@
// or an expression and returns an expression that is
// wrapped in an Elixir struct.
+use polars::lazy::dsl;
use polars::prelude::{
col, concat_str, cov, pearson_corr, spearman_rank_corr, when, IntoLazy, LiteralValue,
SortOptions,
@@ -1042,6 +1043,22 @@ pub fn expr_join(expr: ExExpr, sep: String) -> ExExpr {
ExExpr::new(expr.list().join(sep.lit()))
}
+#[rustler::nif]
+pub fn expr_int_range(start: ExExpr, end: ExExpr, step: i64, dtype: ExSeriesDtype) -> ExExpr {
+ let start = start.clone_inner();
+ let end = end.clone_inner();
+ let dtype = DataType::try_from(&dtype).unwrap();
+ let expr = dsl::int_range(start, end, step, dtype);
+
+ ExExpr::new(expr)
+}
+
+#[rustler::nif]
+pub fn expr_len() -> ExExpr {
+ let expr = dsl::count();
+ ExExpr::new(expr)
+}
+
#[rustler::nif]
pub fn expr_lengths(expr: ExExpr) -> ExExpr {
let expr = expr.clone_inner();
diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs
index 02f7c21..8409c46 100644
--- a/native/explorer/src/lib.rs
+++ b/native/explorer/src/lib.rs
@@ -166,6 +166,8 @@ rustler::init!(
expr_float,
expr_head,
expr_integer,
+ expr_int_range,
+ expr_len,
expr_peaks,
expr_rank,
expr_unary_not,
diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs
index 8a52c9a..533d0ed 100644
--- a/test/explorer/data_frame_test.exs
+++ b/test/explorer/data_frame_test.exs
@@ -4234,4 +4234,45 @@ defmodule Explorer.DataFrameTest do
}
end
end
+
+ describe "row_index/2" do
+ test "int_range" do
+ df = DF.new(a: [1, 3, 5], b: [2, 4, 6])
+
+ df1 = DF.mutate(df, x: len())
+
+ assert DF.to_columns(df1, atom_keys: true) == %{
+ a: [1, 3, 5],
+ b: [2, 4, 6],
+ x: [3, 3, 3]
+ }
+
+ df2 = DF.mutate(df, id: row_index()) |> DF.collect()
+
+ assert DF.to_columns(df2, atom_keys: true) == %{
+ a: [1, 3, 5],
+ b: [2, 4, 6],
+ id: [0, 1, 2]
+ }
+
+ df3 = DF.mutate(df, id: int_range(0, len()))
+ assert DF.to_columns(df3) == DF.to_columns(df2)
+
+ df4 = DF.mutate_with(df, fn _ -> [id: Series.row_index()] end)
+
+ assert DF.to_columns(df4, atom_keys: true) == %{
+ a: [1, 3, 5],
+ b: [2, 4, 6],
+ id: [0, 1, 2]
+ }
+
+ df5 = DF.mutate(df, x: int_range(0, 15, 5))
+
+ assert DF.to_columns(df5, atom_keys: true) == %{
+ a: [1, 3, 5],
+ b: [2, 4, 6],
+ x: [0, 5, 10]
+ }
+ end
+ end
end |
Superseded by #862 |
Do note that
with_row_count
was renamed in polars/main towith_row_index
(ref pola-rs/polars#13494), so perhaps we should wait for a new polars release?