elixir-explorer · philss · Dec 8, 2023 · Dec 6, 2023 · Dec 6, 2023 · Dec 8, 2023
diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex
@@ -203,6 +203,7 @@ defmodule Explorer.Backend.DataFrame do
   @callback nil_count(df) :: df()
   @callback explode(df, out_df :: df(), columns :: [column_name()]) :: df()
   @callback unnest(df, out_df :: df(), columns :: [column_name()]) :: df()
+  @callback correlation(df, out_df :: df(), ddof :: integer()) :: df()
 
   # Two or more table verbs
 

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
@@ -5657,6 +5657,59 @@ defmodule Explorer.DataFrame do
 
   def frequencies(_df, []), do: raise(ArgumentError, "columns cannot be empty")
 
+  @doc """
+  Calculates the pairwise Pearson's correlation of numeric columns.
+
+  ## Supported dtypes
+
+  Only columns with the following dtypes are taken into account.
+
+  * `:integer`
+  * `{:f, 32}`
+  * `{:f, 64}`
+
+  The resultant columns are always `{:f, 64}`.
+
+  ## Options
+
+  * `:columns` - the columns to correlate; non-numeric ones are ignored. Defaults to all columns.
+  * `:column_name` - the name of the output column holding the column names. Defaults to "names".
+  * `:ddof` - the 'delta degrees of freedom' - the divisor used in the correlation
+    calculation. Defaults to 1.
+
+  ## Examples
+
+      iex> df = Explorer.DataFrame.new(dogs: [1, 8, 3], cats: [4, 5, 2])
+      iex> Explorer.DataFrame.correlation(df)
+      #Explorer.DataFrame<
+        Polars[2 x 3]
+        names string ["dogs", "cats"]
+        dogs f64 [1.0000000000000002, 0.5447047794019219]
+        cats f64 [0.5447047794019219, 1.0]
+      >
+  """
+  @doc type: :single
+  @spec correlation(df :: DataFrame.t(), opts :: Keyword.t()) :: df :: DataFrame.t()
+  def correlation(df, opts \\ []) do
+    opts = Keyword.validate!(opts, column_name: "names", columns: names(df), ddof: 1)
+    column_name = to_column_name(opts[:column_name])
+
+    cols =
+      df
+      |> to_existing_columns(opts[:columns])
+      |> Enum.filter(fn name -> numeric_column?(df, name) end)
+
+    out_dtypes = for col <- cols, into: %{column_name => :string}, do: {col, {:f, 64}}
+    out_df = %{df | dtypes: out_dtypes, names: [column_name | cols]}
+
+    Shared.apply_impl(df, :correlation, [out_df, opts[:ddof]])
+  end
+
+  # Reads the dtype from the frame's `dtypes` map; no need to fetch the column as a series.
+  defp numeric_column?(df, name) do
+    df.dtypes[name] in [:integer | Explorer.Shared.float_types()]
+  end
+
   # Helpers
 
   defp backend_from_options!(opts) do

diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex
@@ -764,6 +764,26 @@ defmodule Explorer.PolarsBackend.DataFrame do
     Shared.apply_dataframe(df, out_df, :df_unnest, [columns])
   end
 
+  # Builds the pairwise correlation table: the first column (named by the head of
+  # `out_df.names`) lists the column names, followed by one `{:f, 64}` column per name
+  # holding its correlation with every column, in that same order.
+  @impl true
+  def correlation(df, out_df, ddof) do
+    [column_name | cols] = out_df.names
+    # Fetch each column once; the pairwise loop would otherwise re-fetch it for every pair.
+    series = Enum.map(cols, fn col -> df[col] end)
+
+    correlations =
+      for {left_name, left} <- Enum.zip(cols, series) do
+        pairwise = Enum.map(series, fn right -> PolarsSeries.correlation(left, right, ddof) end)
+        {left_name, pairwise |> Shared.from_list({:f, 64}) |> Shared.create_series()}
+      end
+
+    names_series = cols |> Shared.from_list(:string) |> Shared.create_series()
+
+    from_series([{column_name, names_series} | correlations])
+  end
+
   # Two or more table verbs
 
   @impl true

diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex
@@ -488,6 +488,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
   end
 
   not_available_funs = [
+    correlation: 3,
     describe: 2,
     nil_count: 1,
     dummies: 3,

diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs
@@ -3855,4 +3855,59 @@ defmodule Explorer.DataFrameTest do
                    fn -> DF.unnest(df, [:a, :b]) end
     end
   end
+
+  describe "correlation/2" do
+    test "two integer columns" do
+      corr = DF.correlation(DF.new(dogs: [1, 8, 3], cats: [4, 5, 2]))
+
+      # The matrix is symmetric; the diagonal is 1.0 up to floating-point rounding.
+      expected = %{
+        names: ["dogs", "cats"],
+        dogs: [1.0000000000000002, 0.5447047794019219],
+        cats: [0.5447047794019219, 1.0]
+      }
+
+      assert DF.to_columns(corr, atom_keys: true) == expected
+    end
+
+    test "three integer columns and custom column name" do
+      frame = DF.new(dogs: [1, 2, 3], cats: [3, 2, 1], frogs: [7, 8, 9])
+      corr = DF.correlation(frame, column_name: "variables")
+
+      # `cats` runs opposite to `dogs` and `frogs`, hence the -1.0 entries.
+      expected = %{
+        variables: ["dogs", "cats", "frogs"],
+        dogs: [1.0, -1.0, 1.0],
+        cats: [-1.0, 1.0, -1.0],
+        frogs: [1.0, -1.0, 1.0]
+      }
+
+      assert DF.to_columns(corr, atom_keys: true) == expected
+    end
+
+    test "two float columns" do
+      corr = DF.correlation(DF.new(dogs: [1.4, 8.6, 3.7], cats: [4.1, 5.3, 2.2]))
+
+      expected = %{
+        names: ["dogs", "cats"],
+        dogs: [0.9999999999999999, 0.5642328261411999],
+        cats: [0.5642328261411999, 0.9999999999999998]
+      }
+
+      assert DF.to_columns(corr, atom_keys: true) == expected
+    end
+
+    test "one column" do
+      corr = DF.correlation(DF.new(cats: [4, 5, 2]))
+
+      assert DF.to_columns(corr, atom_keys: true) == %{names: ["cats"], cats: [1.0]}
+    end
+
+    test "no numeric columns" do
+      corr = DF.correlation(DF.new(cats: ["susie", "tuka", "tobias", "terror"]))
+
+      # With no numeric column to correlate, only the (empty) names column is returned.
+      assert DF.to_columns(corr, atom_keys: true) == %{names: []}
+    end
+  end
 end