perf(eda.correlation): increase the performance

sfu-db · Sep 4, 2020 · 3575aac · 3575aac
1 parent 2735787
commit 3575aac
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 12 deletions.
diff --git a/dataprep/eda/correlation/compute/common.py b/dataprep/eda/correlation/compute/common.py
@@ -28,7 +28,7 @@ def rankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
     name="rankdata-bottleneck", pure=True
 )
 def nanrankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
-    """delayed version of rankdata"""
+    """delayed version of rankdata."""
     return nanrankdata_(data, axis=axis)
 
 
@@ -38,6 +38,13 @@ def nanrankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
 def kendalltau(  # pylint: disable=invalid-name
     a: np.ndarray, b: np.ndarray
 ) -> np.ndarray:
-    """delayed version of kendalltau"""
+    """delayed version of kendalltau."""
     corr = kendalltau_(a, b).correlation
     return np.float64(corr)  # Sometimes corr is a float, causes dask error
+
+
+@dask.delayed
+def corrcoef(arr: np.ndarray) -> np.ndarray:
+    """Delayed np.corrcoef: return the correlation of the two columns of arr."""
+    _, (corr, _) = np.corrcoef(arr, rowvar=False)  # columns are variables; keep [1][0]
+    return corr
diff --git a/dataprep/eda/correlation/compute/univariate.py b/dataprep/eda/correlation/compute/univariate.py
@@ -9,9 +9,9 @@
 import numpy as np
 import pandas as pd
 
-from ...intermediate import Intermediate
 from ...data_array import DataArray
-from .common import CorrelationMethod, kendalltau, nanrankdata
+from ...intermediate import Intermediate
+from .common import CorrelationMethod, kendalltau, nanrankdata, corrcoef
 
 
 def _calc_univariate(
@@ -74,17 +74,17 @@ def _calc_univariate(
 def _pearson_1xn(x: da.Array, data: da.Array) -> da.Array:
     _, ncols = data.shape
 
-    datamask = da.isnan(data)
-    xmask = da.isnan(x)[:, 0]
+    fused = da.concatenate([data, x], axis=1)  # x is appended as the last column
+    mask = ~da.isnan(fused)  # mask fused, not data, so that column -1 refers to x
 
     corrs = []
     for j in range(ncols):
-        y = data[:, [j]]
-
-        mask = ~(xmask | datamask[:, j])
-        xy = np.concatenate([x, y], axis=1)[mask]
-        xy.compute_chunk_sizes()  # Not optimal here
-        _, (corr, _) = da.corrcoef(xy, rowvar=False)
+        xy = fused[:, [-1, j]]
+        mask_ = mask[:, -1] & mask[:, j]  # rows where both x and column j are non-null
+        xy = xy[mask_]
+        # da.corrcoef is not usable here: the null filter leaves xy with an unknown
+        # number of rows, so the correlation runs in the delayed corrcoef wrapper.
+        corr = da.from_delayed(corrcoef(xy), dtype=np.float64, shape=())
         corrs.append(corr)
 
     return da.stack(corrs)