Skip to content

Commit

Permalink
fix(eda):fix value table display
Browse files (browse the repository at this point in the history)
  • Loading branch information
jinglinpeng committed Oct 12, 2021
1 parent ce25b17 commit 57281bc
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 12 deletions.
17 changes: 6 additions & 11 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,10 @@ def nom_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
srs = srs.dropna() # drop null values
grps = srs.value_counts(sort=False) # counts of unique values in the series
data["geo"] = grps

if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
data["nuniq"] = grps.shape[0] # total number of groups
data["nuniq"] = grps.shape[0] # total number of groups

# compute bar and pie together unless the parameters are different
if cfg.bar.enable or cfg.pie.enable:
if cfg.bar.enable or cfg.pie.enable or cfg.value_table.enable:
# select the largest or smallest groups
data["bar"] = (
grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
Expand All @@ -138,7 +136,7 @@ def nom_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
data["chisq"] = chisquare(grps.values)

df = grps.reset_index() # dataframe with group names and counts
if cfg.stats.enable:
if cfg.stats.enable or cfg.value_table.enable:
data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
elif cfg.wordfreq.enable and cfg.insight.enable:
data["len_stats"] = {"Minimum": srs.str.len().min(), "Maximum": srs.str.len().max()}
Expand Down Expand Up @@ -190,11 +188,9 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
# pylint: disable=too-many-branches
data: Dict[str, Any] = {}

if cfg.stats.enable or cfg.hist.enable:
data["nrows"] = srs.shape[0] # total rows
data["nrows"] = srs.shape[0] # total rows
srs = srs.dropna()
if cfg.stats.enable:
data["npres"] = srs.shape[0] # number of present (not null) values
data["npres"] = srs.shape[0] # number of present (not null) values
srs = srs[~srs.isin({np.inf, -np.inf})] # remove infinite values
if cfg.hist.enable or cfg.qqnorm.enable and cfg.insight.enable:
data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
Expand Down Expand Up @@ -237,8 +233,7 @@ def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
data.update(_calc_box(srs, data["qntls"], cfg))
if cfg.value_table.enable:
value_counts = srs.value_counts(sort=False)
if cfg.stats.enable:
data["nuniq"] = value_counts.shape[0]
data["nuniq"] = value_counts.shape[0]
data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
elif cfg.stats.enable:
data["nuniq"] = srs.nunique_approx()
Expand Down
6 changes: 5 additions & 1 deletion dataprep/eda/distribution/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -1619,6 +1619,7 @@ def render_cat(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:

if cfg.value_table.enable:
htgs["Value Table"] = cfg.value_table.how_to_guide()
stats = data["stats"]
value_table = _value_table(
data["value_table"], stats["nrows"], stats["npres"], stats["nuniq"]
)
Expand All @@ -1628,7 +1629,10 @@ def render_cat(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
# panel.child.children[0] is a figure
for panel in tabs[0:]:
panel.child.children[0].frame_width = int(plot_width * 0.9)
tabs[0].child.children[0].frame_width = int(plot_width_bar * 0.9)

if len(tabs) > 0:
tabs[0].child.children[0].frame_width = int(plot_width_bar * 0.9)

return {
"tabledata": format_cat_stats(stats, len_stats, letter_stats) if cfg.stats.enable else [],
"value_table": value_table,
Expand Down
7 changes: 7 additions & 0 deletions dataprep/tests/eda/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ...eda import plot
from ...eda.dtypes_v2 import Nominal, LatLong
from ...eda.utils import to_dask
from ...datasets import load_dataset
from .random_data_generator import random_df

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -98,3 +99,9 @@ def test_geo(geodf: dd.DataFrame) -> None:

# Smoke test: calls plot() on the `random_df` argument and makes no
# assertions, so it passes as long as the call does not raise.
# NOTE(review): `random_df` is imported from .random_data_generator and is
# presumably a pytest fixture yielding a pandas DataFrame — confirm there.
def test_random_df(random_df: pd.DataFrame) -> None:
plot(random_df)


def test_plot_titanic() -> None:
    """Smoke-test the "Value Table" display on the titanic dataset.

    Plots the "Sex" column and then the "Age" column, requesting only the
    "Value Table" display each time. There are no assertions; the test
    passes when neither call raises.
    """
    titanic = load_dataset("titanic")
    for column in ("Sex", "Age"):
        plot(titanic, column, display=["Value Table"])

1 comment on commit 57281bc

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

| Benchmark suite | Current: 57281bc | Previous: ce25b17 | Ratio |
|---|---|---|---|
| `dataprep/tests/benchmarks/eda.py::test_create_report` | 0.1544116427393042 iter/sec (stddev: 0.099028250854475) | 0.16245631136579736 iter/sec (stddev: 0.16965107827422618) | 1.05 |

This comment was automatically generated by a workflow using github-action-benchmark.

Please sign in to comment.