Skip to content

Commit

Permalink
feat(eda): Redesigned layout for plot_missing
Browse files Browse the repository at this point in the history
  • Loading branch information
eutialia committed Oct 6, 2020
1 parent 8b274b9 commit c85eaa5
Show file tree
Hide file tree
Showing 17 changed files with 164 additions and 81 deletions.
Binary file modified assets/plot_missing(df).gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/plot_missing(df, x).gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
36 changes: 21 additions & 15 deletions dataprep/eda/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,46 +16,52 @@
output_notebook(INLINE, hide_banner=True) # for offline usage

ENV_LOADER = Environment(
loader=PackageLoader("dataprep", "eda/distribution/templates"),
loader=PackageLoader("dataprep", "eda/templates"),
)


class Container:
"""
This class creates a customized Container object for the plot(df) function.
This class creates a customized Container object for the plot* function.
"""

def __init__(
self,
to_render: Dict[str, Any],
visual_type: str,
) -> None:
if visual_type == "distribution_grid":
if visual_type in ("distribution_grid", "missing_impact_1vn"):
self.context = {
"resources": INLINE.render(),
"components": components(to_render["layout"]),
"tabledata": to_render["tabledata"],
"overview_insights": to_render["overview_insights"],
"column_insights": to_render["column_insights"],
"meta": to_render["meta"],
"components": components(to_render.get("layout")),
"tabledata": to_render.get("tabledata"),
"overview_insights": to_render.get("overview_insights"),
"column_insights": to_render.get("column_insights"),
"meta": to_render.get("meta"),
"title": "DataPrep.EDA Report",
"rnd": random.randint(0, 99), # for multiple cells running in the same notebook
"container_width": f"{to_render['fig_width']*3}px",
"legend_labels": to_render.get("legend_labels"),
}
self.template_base = ENV_LOADER.get_template("grid_base.html")

elif "_column" in visual_type:
elif "_column" in visual_type or visual_type in (
"missing_impact",
"missing_impact_1v1",
):
# todo: param management
to_render["meta"].insert(0, "Stats")
if to_render.get("tabledata"):
to_render["meta"].insert(0, "Stats")
self.context = {
"resources": INLINE.render(),
"tabledata": to_render["tabledata"],
"insights": to_render["insights"],
"components": components(to_render["layout"]),
"meta": to_render["meta"],
"tabledata": to_render.get("tabledata"),
"insights": to_render.get("insights"),
"components": components(to_render.get("layout")),
"meta": to_render.get("meta"),
"title": "DataPrep.EDA Report",
"rnd": random.randint(100, 999), # for multiple cells running in the same notebook
}
self.template_base = ENV_LOADER.get_template("univariate_base.html")
self.template_base = ENV_LOADER.get_template("tab_base.html")
else:
raise TypeError(f"Unsupported Visual Type: {visual_type}.")

Expand Down
6 changes: 3 additions & 3 deletions dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,10 @@ def format_basic(df: dd.DataFrame) -> Dict[str, Any]:
# missing
res["has_missing"] = True
itmdt = completions["miss"](data["miss"])
rndrd = render_missing(itmdt)

rndrd = render_missing(itmdt)["layout"]
figs.clear()
for tab in rndrd.tabs:
fig = tab.child.children[0]
for fig in rndrd:
fig.sizing_mode = "stretch_width"
fig.title = Title(text=tab.title, align="center", text_font_size="20px")
figs.append(fig)
Expand Down
12 changes: 2 additions & 10 deletions dataprep/eda/distribution/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -1421,16 +1421,7 @@ def stats_viz_dt(stats: Dict[str, Any]) -> Dict[str, Dict[str, str]]:

def render_distribution_grid(
itmdt: Intermediate, yscale: str, plot_width: int, plot_height: int
) -> Dict[
str,
Union[
List[str],
List[Figure],
Tuple[Dict[str, str], Dict[str, str]],
Dict[int, List[str]],
Dict[str, List[Union[str, int]]],
],
]:
) -> Dict[str, Any]:
"""
Render plots and dataset stats from plot(df)
""" # pylint: disable=too-many-locals
Expand Down Expand Up @@ -1467,6 +1458,7 @@ def render_distribution_grid(
"tabledata": format_ov_stats(itmdt["stats"]),
"overview_insights": itmdt["overview_insights"],
"column_insights": itmdt["column_insights"],
"fig_width": plot_width,
}


Expand Down
8 changes: 5 additions & 3 deletions dataprep/eda/missing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ..report import Report
from .compute import compute_missing
from .render import render_missing
from ..container import Container

__all__ = ["render_missing", "compute_missing", "plot_missing"]

Expand All @@ -21,11 +22,11 @@ def plot_missing(
x: Optional[str] = None,
y: Optional[str] = None,
*,
bins: int = 30,
bins: int = 20,
ndist_sample: int = 100,
dtype: Optional[DTypeDef] = None,
progress: bool = True,
) -> Report:
) -> Container:
"""
This function is designed to deal with missing values
There are three functions: plot_missing(df), plot_missing(df, x)
Expand Down Expand Up @@ -63,4 +64,5 @@ def plot_missing(
with ProgressBar(minimum=1, disable=not progress):
itmdt = compute_missing(df, x, y, dtype=dtype, bins=bins, ndist_sample=ndist_sample)
fig = render_missing(itmdt)
return Report(fig)

return Container(fig, itmdt.visual_type)
2 changes: 1 addition & 1 deletion dataprep/eda/missing/compute/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
is_dtype,
)

LABELS = ["With Missing", "Missing Dropped"]
LABELS = ["With Missing Remaining", "With Missing Dropped"]


def uni_histogram(
Expand Down
113 changes: 68 additions & 45 deletions dataprep/eda/missing/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This module implements the plot_missing(df, x, y) function's
visualization part.
"""
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
from typing import Any, Dict, List, Optional, Sequence, Tuple, Set

import numpy as np
import pandas as pd
Expand All @@ -16,13 +16,11 @@
FactorRange,
FuncTickFormatter,
HoverTool,
LayoutDOM,
LinearColorMapper,
NumeralTickFormatter,
Panel,
PrintfTickFormatter,
Range1d,
Tabs,
Title,
)
from bokeh.plotting import Figure
Expand All @@ -41,14 +39,14 @@ def render_missing(
itmdt: Intermediate,
plot_width: int = 500,
plot_height: int = 500,
) -> LayoutDOM:
) -> Dict[str, Any]:
"""
Dispatch to the renderer that matches the intermediate's visual type.
"""
if itmdt.visual_type == "missing_impact":
return render_missing_impact(itmdt, plot_width, plot_height)
elif itmdt.visual_type == "missing_impact_1vn":
return render_missing_impact_1vn(itmdt, plot_width, plot_height)
return render_missing_impact_1vn(itmdt, plot_width - 176, plot_height - 200)
elif itmdt.visual_type == "missing_impact_1v1":
return render_missing_impact_1v1(itmdt, plot_width, plot_height)
else:
Expand All @@ -65,6 +63,20 @@ def tweak_figure(fig: Figure) -> Figure:
fig.axis.major_label_text_font_size = "9pt"
fig.axis.major_label_standoff = 0
fig.xaxis.major_label_orientation = np.pi / 3
# truncate axis tick values
format_js = """
if (tick.toString().length > 15) {
if (typeof tick === 'string') {
return tick.toString().substring(0, 13) + '...';
} else {
return tick.toPrecision(1);
}
} else {
return tick;
}
"""
fig.xaxis.formatter = FuncTickFormatter(code=format_js)
fig.yaxis.formatter = FuncTickFormatter(code=format_js)

return fig

Expand Down Expand Up @@ -114,12 +126,13 @@ def render_dist(
return fig


def render_hist(
def render_hist( # pylint: disable=too-many-arguments
df: pd.DataFrame,
x: str,
meta: ColumnMetadata,
plot_width: int,
plot_height: int,
show_legend: bool,
) -> Figure:
"""
Render a histogram
Expand Down Expand Up @@ -167,18 +180,27 @@ def render_hist(
tooltips=tooltips,
)
)
if show_legend:
fig.vbar(
x="x",
width=radius,
top="count",
source=df,
fill_alpha=0.3,
color={"field": "label", "transform": cmapper},
legend_field="label",
)

fig.vbar(
x="x",
width=radius,
top="count",
source=df,
fill_alpha=0.3,
color={"field": "label", "transform": cmapper},
legend_field="label",
)

relocate_legend(fig, "right")
relocate_legend(fig, "right")
else:
fig.vbar(
x="x",
width=radius,
top="count",
source=df,
fill_alpha=0.3,
color={"field": "label", "transform": cmapper},
)

return fig

Expand Down Expand Up @@ -284,7 +306,9 @@ def create_color_mapper_heatmap(
return mapper, colorbar


def render_missing_impact(itmdt: Intermediate, plot_width: int, plot_height: int) -> Tabs:
def render_missing_impact(
itmdt: Intermediate, plot_width: int, plot_height: int
) -> Dict[str, List[Any]]:
"""
Render correlation heatmaps into tabs
"""
Expand All @@ -303,8 +327,10 @@ def render_missing_impact(itmdt: Intermediate, plot_width: int, plot_height: int
fig_dendrogram = render_dendrogram(itmdt["data_dendrogram"], plot_width, plot_height)
tabs.append(Panel(child=row(fig_dendrogram), title="Dendrogram"))

tabs = Tabs(tabs=tabs)
return tabs
return {
"layout": [panel.child.children[0] for panel in tabs],
"meta": [panel.title for panel in tabs],
}


def render_heatmaps(df: Optional[pd.DataFrame], plot_width: int, plot_height: int) -> Figure:
Expand Down Expand Up @@ -371,12 +397,6 @@ def empty_figure() -> Figure:
fill_color={"field": "correlation", "transform": mapper},
line_color=None,
)
format_js = """
if (tick.length > 15) return tick.substring(0, 13) + '...';
else return tick;
"""
fig.xaxis.formatter = FuncTickFormatter(code=format_js)
fig.yaxis.formatter = FuncTickFormatter(code=format_js)
else:
fig = empty_figure()
else:
Expand Down Expand Up @@ -452,12 +472,6 @@ def render_bar_chart(
)
fig.add_tools(hover)

format_js = """
if (tick.length > 18) return tick.substring(0, 16) + '...';
else return tick;
"""
fig.xaxis.formatter = FuncTickFormatter(code=format_js)

fig.yaxis.axis_label = "Row Count"
tweak_figure(fig)
relocate_legend(fig, "right")
Expand Down Expand Up @@ -582,35 +596,42 @@ def render_missing_impact_1vn(
itmdt: Intermediate,
plot_width: int,
plot_height: int,
) -> Tabs:
) -> Dict[str, Any]:
"""
Render the plot from `plot_missing(df, "x")`
"""

dfs = itmdt["data"]
x = itmdt["x"]
meta = itmdt["meta"]

legend_set: Set[str] = set()
panels = []
for col, df in dfs.items():
fig = render_hist(df, col, meta[col], plot_width, plot_height)
fig = render_hist(df, col, meta[col], plot_width, plot_height, False)
shown, total = meta[col]["partial"]

fig.frame_height = plot_height
if shown != total:
fig.title = Title(text=f"Missing impact of {x} by ({shown} out of {total}) {col}")
else:
fig.title = Title(text=f"Missing impact of {x} by {col}")
panels.append(Panel(child=fig, title=col))

tabs = Tabs(tabs=panels)
return tabs
legend_set = legend_set.union(set(df["label"].drop_duplicates().to_list()))
legend_labels = list(legend_set)
legend_colors = [CATEGORY10[count] for count in range(len(legend_labels))]
return {
"layout": [panel.child for panel in panels],
"fig_width": plot_width,
"legend_labels": [
{"label": label, "color": color} for label, color in zip(legend_labels, legend_colors)
],
}


def render_missing_impact_1v1(
itmdt: Intermediate,
plot_width: int,
plot_height: int,
) -> Union[Tabs, Figure]:
) -> Dict[str, List[Any]]:
"""
Render the plot from `plot_missing(df, "x", "y")`
"""
Expand All @@ -620,7 +641,7 @@ def render_missing_impact_1v1(
if is_dtype(meta["dtype"], Continuous()):
panels = []

fig = render_hist(itmdt["hist"], y, meta, plot_width, plot_height)
fig = render_hist(itmdt["hist"], y, meta, plot_width, plot_height, True)
panels.append(Panel(child=fig, title="Histogram"))

fig = render_dist(itmdt["dist"], y, "pdf", plot_width, plot_height)
Expand All @@ -632,14 +653,16 @@ def render_missing_impact_1v1(
fig = render_boxwhisker(itmdt["box"], plot_width, plot_height)
panels.append(Panel(child=fig, title="Box"))

tabs = Tabs(tabs=panels)
return tabs
return {
"layout": [panel.child for panel in panels],
"meta": [panel.title for panel in panels],
}
else:
fig = render_hist(itmdt["hist"], y, meta, plot_width, plot_height)
fig = render_hist(itmdt["hist"], y, meta, plot_width, plot_height, True)

shown, total = meta["partial"]
if shown != total:
fig.title = Title(text=f"Missing impact of {x} by ({shown} out of {total}) {y}")
else:
fig.title = Title(text=f"Missing impact of {x} by {y}")
return fig
return {"layout": [fig], "meta": [fig.title.text]}
Loading

0 comments on commit c85eaa5

Please sign in to comment.