feat(eda): enrich plot_correlation
yuzhenmao committed Nov 19, 2020
1 parent d5cc7bd commit 29c444e
Showing 8 changed files with 3,828 additions and 69 deletions.
6 changes: 5 additions & 1 deletion dataprep/eda/container.py
@@ -48,6 +48,7 @@ def __init__(
elif "_column" in visual_type or visual_type in (
"missing_impact",
"missing_impact_1v1",
"correlation_impact",
):
# todo: param management
if to_render.get("tabledata"):
@@ -61,7 +62,10 @@ def __init__(
"title": "DataPrep.EDA Report",
"rnd": random.randint(100, 999), # for multiple cells running in the same notebook
}
self.template_base = ENV_LOADER.get_template("tab_base.html")
if visual_type == "correlation_impact":
self.template_base = ENV_LOADER.get_template("tab_base_corr.html")
else:
self.template_base = ENV_LOADER.get_template("tab_base.html")
else:
raise TypeError(f"Unsupported Visual Type: {visual_type}.")

12 changes: 8 additions & 4 deletions dataprep/eda/correlation/__init__.py
@@ -11,6 +11,7 @@
from ..report import Report
from .compute import compute_correlation
from .render import render_correlation
from ..container import Container

__all__ = ["render_correlation", "compute_correlation", "plot_correlation"]

@@ -23,7 +24,7 @@ def plot_correlation(
value_range: Optional[Tuple[float, float]] = None,
k: Optional[int] = None,
progress: bool = True,
) -> Report:
) -> Union[Report, Container]:
"""
This function is designed to calculate the correlation between columns
There are three functions: plot_correlation(df), plot_correlation(df, x)
@@ -65,7 +66,10 @@
and it is better to drop None, NaN, and Null values before using it
"""
with ProgressBar(minimum=1, disable=not progress):
intermediate = compute_correlation(df, x=x, y=y, value_range=value_range, k=k)
figure = render_correlation(intermediate)
itmdt = compute_correlation(df, x=x, y=y, value_range=value_range, k=k)
fig = render_correlation(itmdt)

return Report(figure)
if itmdt.visual_type == "correlation_impact" or "_column" in itmdt.visual_type:
return Container(fig, itmdt.visual_type)
else:
return Report(fig)
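With this change, the multi-column overview from plot_correlation(df) comes back as a Container rendered through the new tabbed template, while the single-column and scatter forms still return a Report. A minimal usage sketch, assuming a small pandas DataFrame with numerical columns (the data and column names are illustrative, not taken from this commit):

```python
import pandas as pd
from dataprep.eda import plot_correlation

# Illustrative data; any DataFrame with a few numerical columns works.
df = pd.DataFrame({
    "age": [22, 38, 26, 35, 54, 27],
    "fare": [7.25, 71.28, 7.92, 53.10, 51.86, 10.50],
    "sibsp": [1, 1, 0, 1, 0, 0],
})

# plot_correlation(df) now returns a Container holding the tabbed heatmaps
# plus the stats table and insight strings; plot_correlation(df, "age")
# still returns a Report of Bokeh figures.
result = plot_correlation(df)
result  # in a notebook the returned object displays itself via its HTML repr
```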
154 changes: 152 additions & 2 deletions dataprep/eda/correlation/compute/nullivariate.py
@@ -3,7 +3,7 @@
Currently this boils down to pandas' implementation."""

from functools import partial
from typing import Dict, Optional, Tuple
from typing import Dict, Optional, Tuple, List, Any

import dask
import dask.array as da
@@ -13,6 +13,7 @@
from ...data_array import DataArray
from ...intermediate import Intermediate
from .common import CorrelationMethod
from ...utils import cut_long_name


def _calc_nullivariate(
@@ -21,6 +22,10 @@ def _calc_nullivariate(
value_range: Optional[Tuple[float, float]] = None,
k: Optional[int] = None,
) -> Intermediate:
# pylint: disable=too-many-statements,too-many-locals,too-many-branches

most_show = 6  # the maximum number of columns/rows to show in "insight"
# longest = 5 # the longest length of word to show in "insight"

if value_range is not None and k is not None:
raise ValueError("value_range and k cannot be present in both")
@@ -31,6 +36,38 @@
# So we do them in pandas

(corrs,) = dask.compute(corrs)
pearson_corr, spearman_corr, kendalltau_corr = corrs.values()

pearson_pos_max, pearson_neg_max, pearson_mean, pearson_pos_cols, pearson_neg_cols = most_corr(
pearson_corr
)
(
spearman_pos_max,
spearman_neg_max,
spearman_mean,
spearman_pos_cols,
spearman_neg_cols,
) = most_corr(spearman_corr)
(
kendalltau_pos_max,
kendalltau_neg_max,
kendalltau_mean,
kendalltau_pos_cols,
kendalltau_neg_cols,
) = most_corr(kendalltau_corr)
pearson_min, pearson_cols = least_corr(pearson_corr)
spearman_min, spearman_cols = least_corr(spearman_corr)
kendalltau_min, kendalltau_cols = least_corr(kendalltau_corr)

p_p_corr = create_string("positive", pearson_pos_cols, most_show, df)
s_p_corr = create_string("positive", spearman_pos_cols, most_show, df)
k_p_corr = create_string("positive", kendalltau_pos_cols, most_show, df)
p_n_corr = create_string("negative", pearson_neg_cols, most_show, df)
s_n_corr = create_string("negative", spearman_neg_cols, most_show, df)
k_n_corr = create_string("negative", kendalltau_neg_cols, most_show, df)
p_corr = create_string("least", pearson_cols, most_show, df)
s_corr = create_string("least", spearman_cols, most_show, df)
k_corr = create_string("least", kendalltau_cols, most_show, df)

dfs = {}
for method, corr in corrs.items():
Expand All @@ -55,7 +92,34 @@ def _calc_nullivariate(
return Intermediate(
data=dfs,
axis_range=list(df.columns.unique()),
visual_type="correlation_heatmaps",
visual_type="correlation_impact",
tabledata={
"Highest Positive Correlation": {
"Pearson": pearson_pos_max,
"Spearman": spearman_pos_max,
"KendallTau": kendalltau_pos_max,
},
"Highest Negative Correlation": {
"Pearson": pearson_neg_max,
"Spearman": spearman_neg_max,
"KendallTau": kendalltau_neg_max,
},
"Lowest Correlation": {
"Pearson": pearson_min,
"Spearman": spearman_min,
"KendallTau": kendalltau_min,
},
"Mean Correlation": {
"Pearson": pearson_mean,
"Spearman": spearman_mean,
"KendallTau": kendalltau_mean,
},
},
insights={
"Pearson": [p_p_corr, p_n_corr, p_corr],
"Spearman": [s_p_corr, s_n_corr, s_corr],
"KendallTau": [k_p_corr, k_n_corr, k_corr],
},
)


@@ -110,6 +174,92 @@ def _kendall_tau_nxn(df: DataArray) -> da.Array:
)


def most_corr(corrs: np.ndarray) -> Tuple[float, float, float, List[Any], List[Any]]:
"""Find the most correlated columns."""
positive_col_set = set()
negative_col_set = set()
corrs_copy = corrs.copy()
for i in range(corrs_copy.shape[0]):
corrs_copy[i, i] = 0
mean = corrs_copy.mean()
p_maximum = corrs_copy.max()
n_maximum = (-corrs_copy).max()

if p_maximum != 0:
p_col1, p_col2 = np.where(corrs_copy == p_maximum)
else:
p_col1, p_col2 = [], []
if n_maximum != 0:
n_col1, n_col2 = np.where(corrs_copy == -n_maximum)
else:
n_col1, n_col2 = [], []

for i, _ in enumerate(p_col1):
if p_col1[i] < p_col2[i]:
positive_col_set.add((p_col1[i], p_col2[i]))
elif p_col1[i] > p_col2[i]:
positive_col_set.add((p_col2[i], p_col1[i]))
for i, _ in enumerate(n_col1):
if n_col1[i] < n_col2[i]:
negative_col_set.add((n_col1[i], n_col2[i]))
elif n_col1[i] > n_col2[i]:
negative_col_set.add((n_col2[i], n_col1[i]))

return (
round(p_maximum, 3),
round(-n_maximum, 3),
round(mean, 3),
list(positive_col_set),
list(negative_col_set),
)


def least_corr(corrs: np.ndarray) -> Tuple[float, List[Any]]:
"""Find the least correlated columns."""
col_set = set()
corrs_copy = corrs.copy()
for i in range(corrs_copy.shape[0]):
corrs_copy[i, i] = 2
minimum = abs(corrs_copy).min()
col1, col2 = np.where(abs(corrs_copy) == minimum)

for i, _ in enumerate(col1):
if col1[i] < col2[i]:
col_set.add((col1[i], col2[i]))
elif col1[i] > col2[i]:
col_set.add((col2[i], col1[i]))

return round(minimum, 3), list(col_set)
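A small worked example of the two helpers above, assuming they are in scope (imported from this module); the matrix is illustrative:

```python
import numpy as np

# A 3x3 correlation matrix for three hypothetical columns 0, 1, 2.
corr = np.array([
    [1.0,  0.8, -0.5],
    [0.8,  1.0,  0.1],
    [-0.5, 0.1,  1.0],
])

pos_max, neg_max, mean, pos_cols, neg_cols = most_corr(corr.copy())
# pos_max == 0.8 with pos_cols == [(0, 1)]; neg_max == -0.5 with neg_cols == [(0, 2)];
# mean == 0.089, the average over the matrix with its diagonal zeroed out.

minimum, cols = least_corr(corr.copy())
# minimum == 0.1 with cols == [(1, 2)]: the weakest off-diagonal correlation by absolute value.
```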


def create_string(flag: str, source: List[Any], most_show: int, df: DataArray) -> str:
"""Create the output string"""
suffix = "" if len(source) <= most_show else ", ..."
if flag == "positive":
prefix = "Most positive correlated: "
temp = "Most positive correlated: None"
elif flag == "negative":
prefix = "Most negative correlated: "
temp = "Most negative correlated: None"
elif flag == "least":
prefix = "Least correlated: "
temp = "Least correlated: None"

if source != []:
out = (
prefix
+ ", ".join(
"(" + cut_long_name(df.columns[e[0]]) + ", " + cut_long_name(df.columns[e[1]]) + ")"
for e in source[:most_show]
)
+ suffix
)
else:
out = temp

return out
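An illustrative call, using a hypothetical stand-in object since create_string only reads df.columns (and assuming cut_long_name returns short names unchanged):

```python
from types import SimpleNamespace

# Stand-in for a DataArray: only the .columns attribute is accessed here.
fake_df = SimpleNamespace(columns=["age", "fare", "sibsp"])

print(create_string("positive", [(0, 1), (0, 2)], 6, fake_df))
# -> Most positive correlated: (age, fare), (age, sibsp)
print(create_string("least", [], 6, fake_df))
# -> Least correlated: None
```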


## The code below is the correlation algorithms for array. Since we don't have
## block-wise algorithms for spearman and kendalltal, it might be more suitable
## to just use the pandas version of correlation.
59 changes: 58 additions & 1 deletion dataprep/eda/correlation/render.py
@@ -2,7 +2,7 @@
This module implements the visualization for
plot_correlation(df) function
"""
from typing import List, Optional, Sequence, Tuple
from typing import List, Optional, Sequence, Tuple, Any, Dict

import numpy as np
from bokeh.layouts import column, row
@@ -58,6 +58,8 @@ def render_correlation(
"""
if itmdt.visual_type is None:
visual_elem = Figure()
elif itmdt.visual_type == "correlation_impact":
visual_elem = render_correlation_impact(itmdt, plot_width, plot_height, palette or RDBU)
elif itmdt.visual_type == "correlation_heatmaps":
visual_elem = render_correlation_heatmaps(itmdt, plot_width, plot_height, palette or RDBU)
elif itmdt.visual_type == "correlation_single_heatmaps":
@@ -123,6 +125,61 @@ def tweak_figure(fig: Figure) -> None:
fig.yaxis.formatter = FuncTickFormatter(code=format_js)


def render_correlation_impact(
itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str]
) -> Dict[str, Any]:
"""
Render correlation heatmaps into tabs
"""
tabs: List[Panel] = []
tooltips = [("x", "@x"), ("y", "@y"), ("correlation", "@correlation{1.11}")]
axis_range = itmdt["axis_range"]

for method, df in itmdt["data"].items():
# in case of numerical column names
df = df.copy()
df["x"] = df["x"].apply(str)
df["y"] = df["y"].apply(str)

mapper, color_bar = create_color_mapper(palette)
x_range = FactorRange(*axis_range)
y_range = FactorRange(*reversed(axis_range))
fig = Figure(
x_range=x_range,
y_range=y_range,
plot_width=plot_width,
plot_height=plot_height,
x_axis_location="below",
tools="hover",
toolbar_location=None,
tooltips=tooltips,
background_fill_color="#fafafa",
)

tweak_figure(fig)

fig.rect(
x="x",
y="y",
width=1,
height=1,
source=df,
fill_color={"field": "correlation", "transform": mapper},
line_color=None,
)

fig.add_layout(color_bar, "right")
tab = Panel(child=fig, title=method)
tabs.append(tab)

return {
"insights": itmdt["insights"],
"tabledata": itmdt["tabledata"],
"layout": [panel.child for panel in tabs],
"meta": [panel.title for panel in tabs],
}
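The dict returned here is the to_render payload that Container receives for visual_type="correlation_impact": layout holds one Bokeh figure per correlation method, meta the matching tab titles, and tabledata/insights feed the stats table and insight list in the new template. A hedged end-to-end sketch using only functions exported by this package (the data is illustrative):

```python
import pandas as pd
from dataprep.eda.correlation import compute_correlation, render_correlation

# Tiny illustrative frame with three numerical columns.
df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 1, 4, 3], "c": [4, 3, 2, 1]})

itmdt = compute_correlation(df)        # visual_type == "correlation_impact"
to_render = render_correlation(itmdt)  # the dict built above
for title, fig in zip(to_render["meta"], to_render["layout"]):
    print(title, type(fig).__name__)   # one heatmap each for Pearson, Spearman, KendallTau
```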


def render_correlation_heatmaps(
itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str]
) -> Tabs:
17 changes: 5 additions & 12 deletions dataprep/eda/missing/compute/nullivariate.py
@@ -9,6 +9,7 @@
import pandas as pd
from dask import delayed
from scipy.cluster import hierarchy
from ...utils import cut_long_name

from ...data_array import DataArray
from ...intermediate import Intermediate
@@ -21,7 +22,7 @@ def _compute_missing_nullivariate(df: DataArray, bins: int) -> Generator[Any, An
# pylint: disable=too-many-locals

most_show = 5  # the maximum number of columns/rows to show in "insight"
longest = 5 # the longest length of word to show in "insight"
# longest = 5 # the longest length of word to show in "insight"

df.compute()

@@ -78,18 +79,18 @@

top_miss_col = (
str(most_col[0])
+ "-col(s) "
+ " col(s): "
+ str(
"("
+ ", ".join(abbr(df.columns[e], longest) for e in most_col[2][:most_show])
+ ", ".join(cut_long_name(df.columns[e]) for e in most_col[2][:most_show])
+ suffix_col
+ ")"
)
)

top_miss_row = (
str(most_row[0])
+ "-row(s) "
+ " row(s): "
+ str("(" + ", ".join(str(e) for e in most_row[2][:most_show]) + suffix_row + ")")
)

@@ -299,11 +300,3 @@ def missing_most_row(df: DataArray) -> Tuple[int, float, List[Any]]:
rst = da.where(row_sum == maximum)[0]

return cnt, rate, rst


def abbr(name: str, longest: int) -> str:
"""Cut the name if it is too long."""
if len(name) > longest:
return str(name[0:longest] + "...")
else:
return name
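The per-module abbr() helper above is removed in favor of the shared cut_long_name from dataprep/eda/utils.py, which is not part of this diff. A minimal sketch of what such a helper looks like, assuming an illustrative 18-character cap (the real cap and implementation may differ):

```python
def cut_long_name(name: str, max_len: int = 18) -> str:
    """Truncate an overly long column name for display in insight strings."""
    name = str(name)
    return name if len(name) <= max_len else name[:max_len] + "..."
```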
8 changes: 4 additions & 4 deletions dataprep/eda/templates/tab_base.html
@@ -42,13 +42,13 @@ <h3>{{ title }}</h3>
<input type='checkbox' style='display: none' id="ib-{{ context.rnd }}-{{ loop.index0 }}" class="insight-check-{{ context.rnd }}">
<label class="insight-btn-{{ context.rnd }}" for="ib-{{ context.rnd }}-{{ loop.index0 }}"></label>
<div class="insight-panel-{{ context.rnd }}">
<ol>
<ul>
{% for insight in context.insights[context.meta[loop.index]] %}
<li class="entry-{{ context.rnd }}"><span
class="col-name-{{ context.rnd }}">{{ insight.split(' ')[0] }}</span>{{ insight.replace(insight.split(' ')[0], '') }}
<li class="entry-{{ context.rnd }}">
{{ insight }}
</li>
{% endfor %}
</ol>
</ul>
</div>
</div>
{% endif %}