From 747b8b028a26ca4801a7bfc0c7494642bc740455 Mon Sep 17 00:00:00 2001 From: Cristine Guadelupe Date: Wed, 6 Dec 2023 15:40:00 -0300 Subject: [PATCH 1/3] Add pairwise correlation --- lib/explorer/data_frame.ex | 49 +++++++++++++++++++++++++++ lib/explorer/polars_backend/native.ex | 3 +- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index a62b0d46f..f84f8d1f7 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -5657,6 +5657,55 @@ defmodule Explorer.DataFrame do def frequencies(_df, []), do: raise(ArgumentError, "columns cannot be empty") + @doc """ + Calculates the pairwise correlation of numeric columns. + + ## Supported dtypes + + Only columns with the following dtypes are taken into account. + + * `:integer` + * `{:f, 32}` + * `{:f, 64}` + + ## Options + + * `:columns` - the selection of columns to calculate. Defaults to all numeric columns. + * `:column_name` - the name of the column with variable names. Defaults to "names". + * `:ddof` - the 'delta degrees of freedom' - the divisor used in the correlation + calculation. Defaults to 1. + + ## Examples + + iex> df = Explorer.DataFrame.new(dogs: [1, 8, 3], cats: [4, 5, 2]) + iex> Explorer.DataFrame.correlation(df) + #Explorer.DataFrame< + Polars[2 x 3] + names string ["dogs", "cats"] + dogs f64 [1.0000000000000002, 0.5447047794019219] + cats f64 [0.5447047794019219, 1.0] + > + """ + @doc type: :single + @spec correlation(df :: DataFrame.t(), opts :: Keyword.t()) :: df :: DataFrame.t() + def correlation(df, opts \\ []) do + opts = Keyword.validate!(opts, column_name: "names", columns: names(df), ddof: 1) + + cols = + df + |> to_existing_columns(opts[:columns]) + |> Enum.filter(fn name -> numeric_column?(df, name) end) + + result = for l <- cols, r <- cols, do: Series.correlation(df[l], df[r], opts[:ddof]) + values = Enum.chunk_every(result, length(cols)) + + new([{opts[:column_name], cols} | Enum.zip(cols, values)]) + end + + defp numeric_column?(df, name) do + Series.dtype(df[name]) in [:integer | Explorer.Shared.float_types()] + end + # Helpers defp backend_from_options!(opts) do diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 1c97011af..7cc8187fa 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -6,7 +6,8 @@ defmodule Explorer.PolarsBackend.Native do github_url = mix_config[:package][:links]["GitHub"] # Since Rustler 0.27.0, we need to change manually the mode for each env. # We want "debug" in dev and test because it's faster to compile. - mode = if Mix.env() in [:dev, :test], do: :debug, else: :release + # mode = if Mix.env() in [:dev, :test], do: :debug, else: :release + mode = :debug use_legacy = Application.compile_env( From 6742ba2a7f8af3ce78cb4552ee9ad42d026eab38 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 6 Dec 2023 17:09:25 -0300 Subject: [PATCH 2/3] Move DF.correlation/2 implementation to the backend The idea is to make clear that this won't work yet for lazy frames. Co-authored-by: Cristine Guadelupe --- lib/explorer/backend/data_frame.ex | 1 + lib/explorer/data_frame.ex | 14 +++--- lib/explorer/polars_backend/data_frame.ex | 20 +++++++++ lib/explorer/polars_backend/lazy_frame.ex | 1 + lib/explorer/polars_backend/native.ex | 3 +- test/explorer/data_frame_test.exs | 55 +++++++++++++++++++++++ 6 files changed, 87 insertions(+), 7 deletions(-) diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex index f64e715e7..1a19d2a55 100644 --- a/lib/explorer/backend/data_frame.ex +++ b/lib/explorer/backend/data_frame.ex @@ -203,6 +203,7 @@ defmodule Explorer.Backend.DataFrame do @callback nil_count(df) :: df() @callback explode(df, out_df :: df(), columns :: [column_name()]) :: df() @callback unnest(df, out_df :: df(), columns :: [column_name()]) :: df() + @callback correlation(df, out_df :: df(), ddof :: integer()) :: df() # Two or more table verbs diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index f84f8d1f7..23d4293c0 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -5668,13 +5668,15 @@ defmodule Explorer.DataFrame do * `{:f, 32}` * `{:f, 64}` + The resultant columns are always `{:f, 64}`. + ## Options * `:columns` - the selection of columns to calculate. Defaults to all numeric columns. - * `:column_name` - the name of the column with variable names. Defaults to "names". + * `:column_name` - the name of the column with column names. Defaults to "names". * `:ddof` - the 'delta degrees of freedom' - the divisor used in the correlation calculation. Defaults to 1. - + ## Examples iex> df = Explorer.DataFrame.new(dogs: [1, 8, 3], cats: [4, 5, 2]) @@ -5691,15 +5693,17 @@ defmodule Explorer.DataFrame do def correlation(df, opts \\ []) do opts = Keyword.validate!(opts, column_name: "names", columns: names(df), ddof: 1) + column_name = to_column_name(opts[:column_name]) + cols = df |> to_existing_columns(opts[:columns]) |> Enum.filter(fn name -> numeric_column?(df, name) end) - result = for l <- cols, r <- cols, do: Series.correlation(df[l], df[r], opts[:ddof]) - values = Enum.chunk_every(result, length(cols)) + out_dtypes = for col <- cols, into: %{column_name => :string}, do: {col, {:f, 64}} + out_df = %{df | dtypes: out_dtypes, names: [column_name | cols]} - new([{opts[:column_name], cols} | Enum.zip(cols, values)]) + Shared.apply_impl(df, :correlation, [out_df, opts[:ddof]]) end defp numeric_column?(df, name) do diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex index f352893a3..50488e242 100644 --- a/lib/explorer/polars_backend/data_frame.ex +++ b/lib/explorer/polars_backend/data_frame.ex @@ -764,6 +764,26 @@ defmodule Explorer.PolarsBackend.DataFrame do Shared.apply_dataframe(df, out_df, :df_unnest, [columns]) end + @impl true + def correlation(df, out_df, ddof) do + [column_name | cols] = out_df.names + + correlations = + Enum.map(cols, fn left -> + corr_series = + cols + |> Enum.map(fn right -> PolarsSeries.correlation(df[left], df[right], ddof) end) + |> Shared.from_list({:f, 64}) + |> Shared.create_series() + + {left, corr_series} + end) + + names_series = cols |> Shared.from_list(:string) |> Shared.create_series() + + from_series([{column_name, names_series} | correlations]) + end + # Two or more table verbs @impl true diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex index b95cff512..a1b71ccf7 100644 --- a/lib/explorer/polars_backend/lazy_frame.ex +++ b/lib/explorer/polars_backend/lazy_frame.ex @@ -488,6 +488,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do end not_available_funs = [ + correlation: 3, describe: 2, nil_count: 1, dummies: 3, diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 7cc8187fa..1c97011af 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -6,8 +6,7 @@ defmodule Explorer.PolarsBackend.Native do github_url = mix_config[:package][:links]["GitHub"] # Since Rustler 0.27.0, we need to change manually the mode for each env. # We want "debug" in dev and test because it's faster to compile. - # mode = if Mix.env() in [:dev, :test], do: :debug, else: :release - mode = :debug + mode = if Mix.env() in [:dev, :test], do: :debug, else: :release use_legacy = Application.compile_env( diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index c5266826c..bcf82c64a 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -3855,4 +3855,59 @@ defmodule Explorer.DataFrameTest do fn -> DF.unnest(df, [:a, :b]) end end end + + describe "correlation/2" do + test "two integer columns" do + df = DF.new(dogs: [1, 8, 3], cats: [4, 5, 2]) + df1 = DF.correlation(df) + + assert DF.to_columns(df1, atom_keys: true) == %{ + names: ["dogs", "cats"], + dogs: [1.0000000000000002, 0.5447047794019219], + cats: [0.5447047794019219, 1.0] + } + end + + test "three integer columns and custom column name" do + df = DF.new(dogs: [1, 2, 3], cats: [3, 2, 1], frogs: [7, 8, 9]) + df1 = DF.correlation(df, column_name: "variables") + + assert DF.to_columns(df1, atom_keys: true) == %{ + variables: ["dogs", "cats", "frogs"], + dogs: [1.0, -1.0, 1.0], + cats: [-1.0, 1.0, -1.0], + frogs: [1.0, -1.0, 1.0] + } + end + + test "two float columns" do + df = DF.new(dogs: [1.4, 8.6, 3.7], cats: [4.1, 5.3, 2.2]) + df1 = DF.correlation(df) + + assert DF.to_columns(df1, atom_keys: true) == %{ + names: ["dogs", "cats"], + dogs: [0.9999999999999999, 0.5642328261411999], + cats: [0.5642328261411999, 0.9999999999999998] + } + end + + test "one column" do + df = DF.new(cats: [4, 5, 2]) + df1 = DF.correlation(df) + + assert DF.to_columns(df1, atom_keys: true) == %{ + names: ["cats"], + cats: [1.0] + } + end + + test "no numeric columns" do + df = DF.new(cats: ["susie", "tuka", "tobias", "terror"]) + df1 = DF.correlation(df) + + assert DF.to_columns(df1, atom_keys: true) == %{ + names: [] + } + end + end end From a20e098af8c23f45d96a7ac5b61fa94737f419de Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Thu, 7 Dec 2023 21:22:33 -0300 Subject: [PATCH 3/3] Add note about correlation type we are using. --- lib/explorer/data_frame.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 23d4293c0..4cefa10c1 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -5658,7 +5658,7 @@ defmodule Explorer.DataFrame do def frequencies(_df, []), do: raise(ArgumentError, "columns cannot be empty") @doc """ - Calculates the pairwise correlation of numeric columns. + Calculates the pairwise Pearson's correlation of numeric columns. ## Supported dtypes