From 747b8b028a26ca4801a7bfc0c7494642bc740455 Mon Sep 17 00:00:00 2001
From: Cristine Guadelupe <cristineguadelupe@me.com>
Date: Wed, 6 Dec 2023 15:40:00 -0300
Subject: [PATCH 1/3] Add pairwise correlation

---
 lib/explorer/data_frame.ex            | 49 +++++++++++++++++++++++++++
 lib/explorer/polars_backend/native.ex |  3 +-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
index a62b0d46f..f84f8d1f7 100644
--- a/lib/explorer/data_frame.ex
+++ b/lib/explorer/data_frame.ex
@@ -5657,6 +5657,55 @@ defmodule Explorer.DataFrame do
 
   def frequencies(_df, []), do: raise(ArgumentError, "columns cannot be empty")
 
+  @doc """
+  Calculates the pairwise correlation of numeric columns.
+
+  ## Supported dtypes
+
+  Only columns with the following dtypes are taken into account.
+
+  * `:integer`
+  * `{:f, 32}`
+  * `{:f, 64}`
+
+  ## Options
+
+  * `:columns` - the selection of columns to calculate. Defaults to all numeric columns.
+  * `:column_name` - the name of the column with variable names. Defaults to "names".
+  * `:ddof` - the 'delta degrees of freedom' - the divisor used in the correlation
+    calculation. Defaults to 1.
+   
+  ## Examples
+
+      iex> df = Explorer.DataFrame.new(dogs: [1, 8, 3], cats: [4, 5, 2])
+      iex> Explorer.DataFrame.correlation(df)
+      #Explorer.DataFrame<
+        Polars[2 x 3]
+        names string ["dogs", "cats"]
+        dogs f64 [1.0000000000000002, 0.5447047794019219]
+        cats f64 [0.5447047794019219, 1.0]
+      >
+  """
+  @doc type: :single
+  @spec correlation(df :: DataFrame.t(), opts :: Keyword.t()) :: df :: DataFrame.t()
+  def correlation(df, opts \\ []) do
+    opts = Keyword.validate!(opts, column_name: "names", columns: names(df), ddof: 1)
+
+    cols =
+      df
+      |> to_existing_columns(opts[:columns])
+      |> Enum.filter(fn name -> numeric_column?(df, name) end)
+
+    result = for l <- cols, r <- cols, do: Series.correlation(df[l], df[r], opts[:ddof])
+    values = Enum.chunk_every(result, length(cols))
+
+    new([{opts[:column_name], cols} | Enum.zip(cols, values)])
+  end
+
+  defp numeric_column?(df, name) do
+    Series.dtype(df[name]) in [:integer | Explorer.Shared.float_types()]
+  end
+
   # Helpers
 
   defp backend_from_options!(opts) do
diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
index 1c97011af..7cc8187fa 100644
--- a/lib/explorer/polars_backend/native.ex
+++ b/lib/explorer/polars_backend/native.ex
@@ -6,7 +6,8 @@ defmodule Explorer.PolarsBackend.Native do
   github_url = mix_config[:package][:links]["GitHub"]
   # Since Rustler 0.27.0, we need to change manually the mode for each env.
   # We want "debug" in dev and test because it's faster to compile.
-  mode = if Mix.env() in [:dev, :test], do: :debug, else: :release
+  # mode = if Mix.env() in [:dev, :test], do: :debug, else: :release
+  mode = :debug
 
   use_legacy =
     Application.compile_env(

From 6742ba2a7f8af3ce78cb4552ee9ad42d026eab38 Mon Sep 17 00:00:00 2001
From: Philip Sampaio <philip.sampaio@gmail.com>
Date: Wed, 6 Dec 2023 17:09:25 -0300
Subject: [PATCH 2/3] Move DF.correlation/2 implementation to the backend

Dispatching through the backend makes it explicit that correlation
is not yet supported for lazy frames.

Co-authored-by: Cristine Guadelupe <cristineguadelupe@me.com>
---
 lib/explorer/backend/data_frame.ex        |  1 +
 lib/explorer/data_frame.ex                | 14 +++---
 lib/explorer/polars_backend/data_frame.ex | 20 +++++++++
 lib/explorer/polars_backend/lazy_frame.ex |  1 +
 lib/explorer/polars_backend/native.ex     |  3 +-
 test/explorer/data_frame_test.exs         | 55 +++++++++++++++++++++++
 6 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex
index f64e715e7..1a19d2a55 100644
--- a/lib/explorer/backend/data_frame.ex
+++ b/lib/explorer/backend/data_frame.ex
@@ -203,6 +203,7 @@ defmodule Explorer.Backend.DataFrame do
   @callback nil_count(df) :: df()
   @callback explode(df, out_df :: df(), columns :: [column_name()]) :: df()
   @callback unnest(df, out_df :: df(), columns :: [column_name()]) :: df()
+  @callback correlation(df, out_df :: df(), ddof :: integer()) :: df()
 
   # Two or more table verbs
 
diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
index f84f8d1f7..23d4293c0 100644
--- a/lib/explorer/data_frame.ex
+++ b/lib/explorer/data_frame.ex
@@ -5668,13 +5668,15 @@ defmodule Explorer.DataFrame do
   * `{:f, 32}`
   * `{:f, 64}`
 
+  The resultant columns are always `{:f, 64}`.
+
   ## Options
 
   * `:columns` - the selection of columns to calculate. Defaults to all numeric columns.
-  * `:column_name` - the name of the column with variable names. Defaults to "names".
+  * `:column_name` - the name of the column with column names. Defaults to "names".
   * `:ddof` - the 'delta degrees of freedom' - the divisor used in the correlation
     calculation. Defaults to 1.
-   
+
   ## Examples
 
       iex> df = Explorer.DataFrame.new(dogs: [1, 8, 3], cats: [4, 5, 2])
@@ -5691,15 +5693,17 @@ defmodule Explorer.DataFrame do
   def correlation(df, opts \\ []) do
     opts = Keyword.validate!(opts, column_name: "names", columns: names(df), ddof: 1)
 
+    column_name = to_column_name(opts[:column_name])
+
     cols =
       df
       |> to_existing_columns(opts[:columns])
       |> Enum.filter(fn name -> numeric_column?(df, name) end)
 
-    result = for l <- cols, r <- cols, do: Series.correlation(df[l], df[r], opts[:ddof])
-    values = Enum.chunk_every(result, length(cols))
+    out_dtypes = for col <- cols, into: %{column_name => :string}, do: {col, {:f, 64}}
+    out_df = %{df | dtypes: out_dtypes, names: [column_name | cols]}
 
-    new([{opts[:column_name], cols} | Enum.zip(cols, values)])
+    Shared.apply_impl(df, :correlation, [out_df, opts[:ddof]])
   end
 
   defp numeric_column?(df, name) do
diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex
index f352893a3..50488e242 100644
--- a/lib/explorer/polars_backend/data_frame.ex
+++ b/lib/explorer/polars_backend/data_frame.ex
@@ -764,6 +764,26 @@ defmodule Explorer.PolarsBackend.DataFrame do
     Shared.apply_dataframe(df, out_df, :df_unnest, [columns])
   end
 
+  @impl true
+  def correlation(df, out_df, ddof) do
+    [column_name | cols] = out_df.names
+
+    correlations =
+      Enum.map(cols, fn left ->
+        corr_series =
+          cols
+          |> Enum.map(fn right -> PolarsSeries.correlation(df[left], df[right], ddof) end)
+          |> Shared.from_list({:f, 64})
+          |> Shared.create_series()
+
+        {left, corr_series}
+      end)
+
+    names_series = cols |> Shared.from_list(:string) |> Shared.create_series()
+
+    from_series([{column_name, names_series} | correlations])
+  end
+
   # Two or more table verbs
 
   @impl true
diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex
index b95cff512..a1b71ccf7 100644
--- a/lib/explorer/polars_backend/lazy_frame.ex
+++ b/lib/explorer/polars_backend/lazy_frame.ex
@@ -488,6 +488,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
   end
 
   not_available_funs = [
+    correlation: 3,
     describe: 2,
     nil_count: 1,
     dummies: 3,
diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
index 7cc8187fa..1c97011af 100644
--- a/lib/explorer/polars_backend/native.ex
+++ b/lib/explorer/polars_backend/native.ex
@@ -6,8 +6,7 @@ defmodule Explorer.PolarsBackend.Native do
   github_url = mix_config[:package][:links]["GitHub"]
   # Since Rustler 0.27.0, we need to change manually the mode for each env.
   # We want "debug" in dev and test because it's faster to compile.
-  # mode = if Mix.env() in [:dev, :test], do: :debug, else: :release
-  mode = :debug
+  mode = if Mix.env() in [:dev, :test], do: :debug, else: :release
 
   use_legacy =
     Application.compile_env(
diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs
index c5266826c..bcf82c64a 100644
--- a/test/explorer/data_frame_test.exs
+++ b/test/explorer/data_frame_test.exs
@@ -3855,4 +3855,59 @@ defmodule Explorer.DataFrameTest do
                    fn -> DF.unnest(df, [:a, :b]) end
     end
   end
+
+  describe "correlation/2" do
+    test "two integer columns" do
+      df = DF.new(dogs: [1, 8, 3], cats: [4, 5, 2])
+      df1 = DF.correlation(df)
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               names: ["dogs", "cats"],
+               dogs: [1.0000000000000002, 0.5447047794019219],
+               cats: [0.5447047794019219, 1.0]
+             }
+    end
+
+    test "three integer columns and custom column name" do
+      df = DF.new(dogs: [1, 2, 3], cats: [3, 2, 1], frogs: [7, 8, 9])
+      df1 = DF.correlation(df, column_name: "variables")
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               variables: ["dogs", "cats", "frogs"],
+               dogs: [1.0, -1.0, 1.0],
+               cats: [-1.0, 1.0, -1.0],
+               frogs: [1.0, -1.0, 1.0]
+             }
+    end
+
+    test "two float columns" do
+      df = DF.new(dogs: [1.4, 8.6, 3.7], cats: [4.1, 5.3, 2.2])
+      df1 = DF.correlation(df)
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               names: ["dogs", "cats"],
+               dogs: [0.9999999999999999, 0.5642328261411999],
+               cats: [0.5642328261411999, 0.9999999999999998]
+             }
+    end
+
+    test "one column" do
+      df = DF.new(cats: [4, 5, 2])
+      df1 = DF.correlation(df)
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               names: ["cats"],
+               cats: [1.0]
+             }
+    end
+
+    test "no numeric columns" do
+      df = DF.new(cats: ["susie", "tuka", "tobias", "terror"])
+      df1 = DF.correlation(df)
+
+      assert DF.to_columns(df1, atom_keys: true) == %{
+               names: []
+             }
+    end
+  end
 end

From a20e098af8c23f45d96a7ac5b61fa94737f419de Mon Sep 17 00:00:00 2001
From: Philip Sampaio <philip.sampaio@gmail.com>
Date: Thu, 7 Dec 2023 21:22:33 -0300
Subject: [PATCH 3/3] Document that DF.correlation/2 computes Pearson's correlation

---
 lib/explorer/data_frame.ex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
index 23d4293c0..4cefa10c1 100644
--- a/lib/explorer/data_frame.ex
+++ b/lib/explorer/data_frame.ex
@@ -5658,7 +5658,7 @@ defmodule Explorer.DataFrame do
   def frequencies(_df, []), do: raise(ArgumentError, "columns cannot be empty")
 
   @doc """
-  Calculates the pairwise correlation of numeric columns.
+  Calculates the pairwise Pearson's correlation of numeric columns.
 
   ## Supported dtypes