Skip to content

Commit

Permalink
perf(eda.correlation): increase the performance
Browse files Browse the repository at this point in the history
  • Loading branch information
dovahcrow committed Sep 4, 2020
1 parent 2735787 commit 3575aac
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 12 deletions.
11 changes: 9 additions & 2 deletions dataprep/eda/correlation/compute/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def rankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
name="rankdata-bottleneck", pure=True
)
def nanrankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
"""delayed version of rankdata"""
"""delayed version of rankdata."""
return nanrankdata_(data, axis=axis)


Expand All @@ -38,6 +38,13 @@ def nanrankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
def kendalltau(  # pylint: disable=invalid-name
    a: np.ndarray, b: np.ndarray
) -> np.ndarray:
    """Delayed version of kendalltau; returns the correlation as np.float64."""
    result = kendalltau_(a, b)
    # The correlation is sometimes a plain float, which causes a dask error,
    # so it is always wrapped in np.float64 before being returned.
    return np.float64(result.correlation)


@dask.delayed
def corrcoef(arr: np.ndarray) -> np.ndarray:
    """Delayed version of np.corrcoef.

    The columns of ``arr`` are treated as the variables (``rowvar=False``) and
    the first entry of the second row of the resulting matrix is returned.
    """
    matrix = np.corrcoef(arr, rowvar=False)
    # Unpack instead of indexing: this keeps the implicit requirement that the
    # matrix has exactly two rows and that the second row has two entries.
    _, second_row = matrix
    coefficient, _ = second_row
    return coefficient
20 changes: 10 additions & 10 deletions dataprep/eda/correlation/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import numpy as np
import pandas as pd

from ...intermediate import Intermediate
from ...data_array import DataArray
from .common import CorrelationMethod, kendalltau, nanrankdata
from ...intermediate import Intermediate
from .common import CorrelationMethod, kendalltau, nanrankdata, corrcoef


def _calc_univariate(
Expand Down Expand Up @@ -74,17 +74,17 @@ def _calc_univariate(
def _pearson_1xn(x: da.Array, data: da.Array) -> da.Array:
    """Compute the Pearson correlation of ``x`` with every column of ``data``.

    For each column of ``data``, rows where either ``x`` or that column is NaN
    are dropped before the coefficient is computed.

    Parameters
    ----------
    x
        Array concatenated to ``data`` along axis 1; its (last) column is the
        one correlated against. Presumably a single column with as many rows
        as ``data`` -- TODO confirm against callers.
    data
        2-D array; each of its columns is correlated with ``x``.

    Returns
    -------
    da.Array
        The per-column coefficients stacked in column order.
    """
    _, ncols = data.shape

    # x is appended last so that index -1 refers to it regardless of ncols.
    fused = da.concatenate([data, x], axis=1)
    # The mask must be derived from ``fused`` rather than ``data``: column -1
    # has to be the validity of x, not of the last column of ``data``.
    valid = ~da.isnan(fused)
    x_valid = valid[:, -1]  # loop-invariant, computed once

    corrs = []
    for j in range(ncols):
        xy = fused[:, [-1, j]][x_valid & valid[:, j]]
        # da.corrcoef is not usable here because xy has an unknown number of
        # rows after the null filter, so the delayed NumPy version is used.
        # np.float64 replaces the np.float alias, which newer NumPy removed.
        corr = da.from_delayed(corrcoef(xy), dtype=np.float64, shape=())
        corrs.append(corr)

    return da.stack(corrs)
Expand Down

0 comments on commit 3575aac

Please sign in to comment.