fix(eda): fix missing for SmallCard and DateTime type
jinglinpeng authored and dovahcrow committed May 16, 2021
1 parent fe515d6 commit 201e487
Showing 8 changed files with 121 additions and 63 deletions.
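
At a high level, the missing-value computations stop re-detecting column types with `detect_dtype` and instead read the dtype already recorded on the `EDAFrame`, treating `SmallCardNum` and `DateTime` columns like categorical data by casting their non-null values to `str` before counting. The snippet below is a minimal sketch of that conversion idea with a made-up column, not dataprep's exact code:

```python
import pandas as pd

# Hypothetical DateTime column with a few distinct values and one missing entry.
srs = pd.Series(pd.to_datetime(["2021-05-01", "2021-05-02", None, "2021-05-01"]))

# SmallCardNum / DateTime columns are counted like categorical data:
# drop the missing entries, cast to str, then count the distinct values.
counts = srs.dropna().astype(str).value_counts()
print(counts)
```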
10 changes: 2 additions & 8 deletions dataprep/eda/eda_frame.py
@@ -13,13 +13,7 @@
import pandas as pd
import pandas._libs.missing as libmissing

from .dtypes_v2 import (
NUMERICAL_DTYPES,
DType,
DTypeDef,
detect_dtype,
Nominal,
)
from .dtypes_v2 import NUMERICAL_DTYPES, DType, DTypeDef, detect_dtype, Nominal, GeoGraphy

DataFrame = Union[pd.DataFrame, dd.DataFrame, "EDAFrame"]

@@ -113,7 +107,7 @@ def __init__(

# Transform categorical column to string for non-na values.
for col in ddf.columns:
if isinstance(self._eda_dtypes[col], Nominal):
if isinstance(self._eda_dtypes[col], (Nominal, GeoGraphy)):
ddf[col] = ddf[col].apply(_to_str_if_not_na, meta=(col, "object"))

self._ddf = ddf.persist()
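The `EDAFrame` constructor already stringified `Nominal` columns; this hunk extends the same treatment to `GeoGraphy`. Below is a rough sketch of the apply-with-meta pattern, using a hypothetical stand-in for the `_to_str_if_not_na` helper rather than the real implementation:

```python
import dask.dataframe as dd
import pandas as pd

def to_str_if_not_na(val):
    # Hypothetical stand-in for dataprep's _to_str_if_not_na helper:
    # leave missing values alone, stringify everything else.
    return val if pd.isna(val) else str(val)

pdf = pd.DataFrame({"country": ["Canada", None, 42]})
ddf = dd.from_pandas(pdf, npartitions=1)

# The same apply-with-meta pattern the constructor uses for Nominal
# (and now GeoGraphy) columns.
ddf["country"] = ddf["country"].apply(to_str_if_not_na, meta=("country", "object"))
print(ddf.compute())
```
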
4 changes: 2 additions & 2 deletions dataprep/eda/missing/compute/__init__.py
@@ -74,9 +74,9 @@ def compute_missing(
if x is None and y is not None:
raise ValueError("x cannot be None while y has value")
elif x is not None and y is None:
ret = compute_missing_univariate(eda_frame, x, cfg, dtype)
ret = compute_missing_univariate(eda_frame, x, cfg)
elif x is not None and y is not None:
ret = compute_missing_bivariate(eda_frame, x, y, cfg, dtype)
ret = compute_missing_bivariate(eda_frame, x, y, cfg)
else:
ret = compute_missing_nullivariate(eda_frame, cfg)

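With the dtype carried by the `EDAFrame` itself, the `dtype` argument is dropped from the univariate and bivariate helpers. For context, a hedged example of the public call path that eventually reaches `compute_missing` (the DataFrame and column names are illustrative):

```python
import numpy as np
import pandas as pd
from dataprep.eda import plot_missing

df = pd.DataFrame({
    "x": [1.0, np.nan, 3.0, np.nan, 5.0],
    "y": pd.to_datetime(["2021-05-01", "2021-05-02", None, "2021-05-03", "2021-05-01"]),
})

# Univariate: how dropping the missing values in "x" changes every other column.
plot_missing(df, "x")
# Bivariate: how dropping the missing values in "x" changes the distribution of "y".
plot_missing(df, "x", "y")
```
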
37 changes: 25 additions & 12 deletions dataprep/eda/missing/compute/bivariate.py
@@ -1,7 +1,7 @@
"""This module implements the plot_missing(df) function's
calculating intermediate part."""

from typing import Any, Generator, List, Optional
from typing import Any, Generator, List

import dask.dataframe as dd
import numpy as np
@@ -10,32 +10,45 @@

from ...configs import Config
from ...eda_frame import EDAFrame
from ...dtypes_v2 import Continuous, DTypeDef, Nominal, GeoGraphy
from ...dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, DateTime
from ...intermediate import ColumnsMetadata, Intermediate
from ...staged import staged
from .common import LABELS, histogram


def _compute_missing_bivariate( # pylint: disable=too-many-locals,too-many-statements
def _compute_missing_bivariate( # pylint: disable=too-many-locals,too-many-statements, too-many-branches
df: EDAFrame,
x: str,
y: str,
cfg: Config,
dtype: Optional[DTypeDef] = None,
) -> Generator[Any, Any, Intermediate]:
"""Calculate the distribution change on another column y when
the missing values in x are dropped."""

xloc, yloc = df.columns.get_loc(x), df.columns.get_loc(y)

col0 = df.values[~df.nulls[:, yloc], yloc].astype(df.dtypes[y])
col1 = df.values[~(df.nulls[:, xloc] | df.nulls[:, yloc]), yloc].astype(df.dtypes[y])

minimum, maximum = col0.min(), col0.max()
y_dtype = df.get_eda_dtype(y)
bins = cfg.bar.bars if isinstance(y_dtype, (Nominal, GeoGraphy)) else cfg.hist.bins
# dataframe with all rows where column x is null removed
ddf = df.frame[~df.frame[x].isna()]
if isinstance(y_dtype, (SmallCardNum, DateTime)):
col0 = df.frame[y].dropna().astype(str).values # series from original dataframe
col1 = ddf[y].dropna().astype(str).values # series with null rows from col x removed
elif isinstance(y_dtype, (GeoGraphy, Nominal, Continuous)):
# GeoGraphy and Nominal are already transformed to str when constructing the EDAFrame,
# so we do not need to transform them again here.
col0 = df.frame[y].dropna().values
col1 = ddf[y].dropna().values
else:
raise ValueError(f"unprocessed type:{y_dtype}")

hists = [histogram(col, bins, return_edges=True, dtype=dtype) for col in [col0, col1]]
minimum, maximum = col0.min(), col0.max()
bins = (
cfg.bar.bars
if isinstance(y_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime))
else cfg.hist.bins
)

hists = [
histogram(col, eda_dtype=y_dtype, bins=bins, return_edges=True) for col in [col0, col1]
]

quantiles = None
if isinstance(y_dtype, Continuous) and cfg.box.enable:
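In the bivariate case, column `y` is now materialized twice from the frame: once from the original data and once after removing the rows where `x` is missing, with `SmallCardNum`/`DateTime` values cast to `str`. A small self-contained sketch of that pairing on toy data, not the library's internals:

```python
import numpy as np
import pandas as pd

# Toy frame: "x" has missing values, "y" is a small-cardinality numeric column.
df = pd.DataFrame({
    "x": [1.0, np.nan, 3.0, np.nan, 5.0, 6.0],
    "y": [0, 1, 0, 1, np.nan, 0],
})

# col0: the distribution of y in the original data.
col0 = df["y"].dropna().astype(str).values
# col1: the distribution of y after dropping the rows where x is missing.
col1 = df[~df["x"].isna()]["y"].dropna().astype(str).values

for label, col in zip(["Original data", "After drop missing values"], [col0, col1]):
    print(label, np.unique(col, return_counts=True))
```
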
19 changes: 8 additions & 11 deletions dataprep/eda/missing/compute/common.py
@@ -5,28 +5,26 @@
import dask.dataframe as dd

from ...configs import Config
from ...dtypes_v2 import Continuous, DTypeDef, Nominal, detect_dtype, GeoGraphy
from ...dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, DateTime, DType

LABELS = ["Original data", "After drop missing values"]


def uni_histogram(
srs: dd.Series,
srs_dtype: DType,
cfg: Config,
dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
"""Calculate "histogram" for both numerical and categorical."""

srs_type = detect_dtype(srs, srs.head(), dtype)

if isinstance(srs_type, Continuous):
if isinstance(srs_dtype, Continuous):

counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
centers = (edges[:-1] + edges[1:]) / 2

return counts, centers, edges

elif isinstance(srs_type, (Nominal, GeoGraphy)):
elif isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
# Dask array's unique is way slower than the values_counts on Series
# See https://github.com/dask/dask/issues/2851
# centers, counts = da.unique(arr, return_counts=True)
@@ -43,17 +41,16 @@ def uni_histogram(

def histogram(
arr: da.Array,
eda_dtype: DType,
bins: Optional[int] = None,
return_edges: bool = True,
range: Optional[Tuple[int, int]] = None, # pylint: disable=redefined-builtin
dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
"""Calculate "histogram" for both numerical and categorical."""
if len(arr.shape) != 1:
raise ValueError("Histogram only supports 1-d array.")
srs = dd.from_dask_array(arr)
detected_type = detect_dtype(srs, srs.head(), dtype)
if isinstance(detected_type, Continuous):
if isinstance(eda_dtype, Continuous):
if range is not None:
minimum, maximum = range
else:
@@ -68,7 +65,7 @@
if not return_edges:
return counts, centers
return counts, centers, edges
elif isinstance(detected_type, (Nominal, GeoGraphy)):
elif isinstance(eda_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
# Dask array's unique is way slower than the values_counts on Series
# See https://github.com/dask/dask/issues/2851
# centers, counts = da.unique(arr, return_counts=True)
Expand All @@ -80,4 +77,4 @@ def histogram(

return (counts, centers)
else:
raise ValueError(f"Unsupported dtype {arr.dtype}")
raise ValueError(f"Unsupported dtype {eda_dtype}")
32 changes: 21 additions & 11 deletions dataprep/eda/missing/compute/univariate.py
@@ -1,14 +1,14 @@
"""This module implements the plot_missing(df, x) function's
calculating intermediate part
"""
from typing import Any, Generator, List, Optional
from typing import Any, Generator, List

import numpy as np
import pandas as pd

from ...configs import Config
from ...eda_frame import EDAFrame
from ...dtypes_v2 import DTypeDef, Continuous, Nominal, GeoGraphy
from ...dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, DateTime
from ...intermediate import ColumnsMetadata, Intermediate
from ...staged import staged
from .common import LABELS, uni_histogram
@@ -18,7 +18,6 @@ def _compute_missing_univariate( # pylint: disable=too-many-locals
df: EDAFrame,
x: str,
cfg: Config,
dtype: Optional[DTypeDef] = None,
) -> Generator[Any, Any, Intermediate]:
"""Calculate the distribution change on other columns when
the missing values in x are dropped."""
@@ -33,17 +32,26 @@ def _compute_missing_univariate( # pylint: disable=too-many-locals
col_dtype = df.get_eda_dtype(col)
if (
col == x
or isinstance(col_dtype, (Nominal, GeoGraphy))
and not cfg.bar.enable
or isinstance(col_dtype, Continuous)
and not cfg.hist.enable
or (
isinstance(col_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime))
and not cfg.bar.enable
)
or (isinstance(col_dtype, Continuous) and not cfg.hist.enable)
):
continue

srs0 = df.frame[col].dropna() # series from original dataframe
srs1 = ddf[col].dropna() # series with null rows from col x removed
if isinstance(col_dtype, (SmallCardNum, DateTime)):
srs0 = df.frame[col].dropna().astype(str) # series from original dataframe
srs1 = ddf[col].dropna().astype(str) # series with null rows from col x removed
elif isinstance(col_dtype, (GeoGraphy, Nominal, Continuous)):
# GeoGraphy and Nominal are already transformed to str when constructing the EDAFrame,
# so we do not need to transform them again here.
srs0 = df.frame[col].dropna()
srs1 = ddf[col].dropna()
else:
raise ValueError(f"unprocessed type:{col_dtype}")

hists[col] = [uni_histogram(srs, cfg, dtype) for srs in [srs0, srs1]]
hists[col] = [uni_histogram(srs, col_dtype, cfg) for srs in [srs0, srs1]]

### Lazy Region End
hists = yield hists
@@ -80,7 +88,9 @@ def _compute_missing_univariate( # pylint: disable=too-many-locals
# If the cardinality of a categorical column is too large,
# we show the top `num_bins` values, sorted by their count before drop
col_dtype = df.get_eda_dtype(col_name)
if len(counts[0]) > cfg.bar.bars and (isinstance(col_dtype, (Nominal, GeoGraphy))):
if len(counts[0]) > cfg.bar.bars and (
isinstance(col_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime))
):
sortidx = np.argsort(-counts[0])
selected_xs = xs[0][sortidx[: cfg.bar.bars]]
ret_df = ret_df[ret_df["x"].isin(selected_xs)]
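When a categorical-like column has too many distinct values, only the top `cfg.bar.bars` categories are kept, ranked by their counts before the drop. A tiny numpy sketch of that selection, with `nbars` standing in for `cfg.bar.bars`:

```python
import numpy as np

# Hypothetical counts before dropping missing values, with matching category labels.
counts = np.array([5, 30, 12, 7])
xs = np.array(["a", "b", "c", "d"])
nbars = 2  # stand-in for cfg.bar.bars

# Keep only the categories with the largest counts in the original data.
sortidx = np.argsort(-counts)
selected_xs = xs[sortidx[:nbars]]
print(selected_xs)  # ['b' 'c']
```
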
37 changes: 27 additions & 10 deletions dataprep/eda/missing/render.py
@@ -25,7 +25,7 @@

from ...errors import UnreachableError
from ..configs import Config
from ..dtypes_v2 import Continuous, Nominal, GeoGraphy, drop_null, is_dtype
from ..dtypes_v2 import Continuous, Nominal, GeoGraphy, SmallCardNum, drop_null, DateTime
from ..intermediate import ColumnMetadata, Intermediate
from ..palette import CATEGORY10, CATEGORY20, GREYS256, RDBU
from ..utils import cut_long_name, fuse_missing_perc, relocate_legend
@@ -109,7 +109,7 @@ def render_dist(
return fig


def render_hist( # pylint: disable=too-many-arguments
def render_hist(
df: pd.DataFrame,
x: str,
meta: ColumnMetadata,
@@ -120,13 +120,16 @@
"""
Render a histogram
"""
if is_dtype(meta["dtype"], Nominal()) or is_dtype(meta["dtype"], GeoGraphy()):
# pylint: disable=too-many-arguments
# pylint: disable=too-many-locals

if isinstance(meta["dtype"], (Nominal, GeoGraphy, SmallCardNum, DateTime)):
tooltips = [
(x, "@x"),
("Count", "@count"),
("Label", "@label"),
]
else:
elif isinstance(meta["dtype"], Continuous):
df = df.copy()
df["repr"] = [f"[{row.lower_bound:.0f}~{row.upper_bound:.0f})" for row in df.itertuples()]

@@ -135,21 +138,28 @@
("Frequency", "@count"),
("Label", "@label"),
]
else:
mtype = type(meta["dtype"])
raise ValueError(f"unprocessed data type:{mtype}, col:{x}")

cols = [f"{col[:12]}..." if isinstance(col, str) and len(col) > 18 else col for col in df["x"]]
df["x"] = cols
cmapper = CategoricalColorMapper(palette=CATEGORY10, factors=LABELS)

if is_dtype(meta["dtype"], Nominal()) or is_dtype(meta["dtype"], GeoGraphy()):
if isinstance(meta["dtype"], (Nominal, GeoGraphy, SmallCardNum, DateTime)):
radius = 0.99

# Inputs of FactorRange() have to be sequence of strings,
# object only contains numbers can cause errors.(Issue#98).
df["x"] = df["x"].astype("str")
x_range = FactorRange(*df["x"].unique())
else:
elif isinstance(meta["dtype"], Continuous):

radius = df["x"][1] - df["x"][0]
x_range = Range1d(df["x"].min() - radius, df["x"].max() + radius)
else:
mtype = type(meta["dtype"])
raise ValueError(f"unprocessed data type:{mtype}, col:{x}")

y_range = Range1d(0, df["count"].max() * 1.05)

@@ -608,6 +618,7 @@ def render_missing_impact_1vn(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
"""
Render the plot from `plot_missing(df, "x")`
"""
# pylint: disable = too-many-locals
plot_width = cfg.plot.width if cfg.plot.width is not None else 300
plot_height = cfg.plot.height if cfg.plot.height is not None else 300

@@ -623,10 +634,13 @@ def render_missing_impact_1vn(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
fig.frame_height = plot_height
panels.append(Panel(child=fig, title=col))

if is_dtype(meta[col]["dtype"], Nominal()) or is_dtype(meta[col]["dtype"], GeoGraphy()):
if isinstance(meta[col]["dtype"], (Nominal, GeoGraphy, SmallCardNum, DateTime)):
htgs[title] = cfg.bar.grid_how_to_guide()
else:
elif isinstance(meta[col]["dtype"], Continuous):
htgs[title] = cfg.hist.grid_how_to_guide()
else:
mtype = type(meta[col]["dtype"])
raise ValueError(f"unprocessed type:{mtype}")
titles.append(title)
legend_colors = [CATEGORY10[count] for count in range(len(LABELS))]
return {
@@ -650,7 +664,7 @@ def render_missing_impact_1v1(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
x, y, meta = itmdt["x"], itmdt["y"], itmdt["meta"]
htgs: Dict[str, List[Tuple[str, str]]] = {}

if is_dtype(meta["dtype"], Continuous()):
if isinstance(meta["dtype"], Continuous):
panels = []

if cfg.hist.enable:
@@ -679,7 +693,7 @@ def render_missing_impact_1v1(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
"container_width": max([panel.child.plot_width for panel in panels]),
"how_to_guide": htgs,
}
else:
elif isinstance(meta["dtype"], (Nominal, SmallCardNum, GeoGraphy, DateTime)):
fig = render_hist(itmdt["hist"], y, meta, plot_width, plot_height, True)
shown, total = meta["shown"], meta["total"]
if shown != total:
Expand All @@ -693,3 +707,6 @@ def render_missing_impact_1v1(itmdt: Intermediate, cfg: Config) -> Dict[str, Any
"container_width": fig.plot_width,
"how_to_guide": htgs,
}
else:
mtype = type(meta["dtype"])
raise ValueError(f"unsupported type:{mtype}")
36 changes: 28 additions & 8 deletions dataprep/tests/eda/random_data_generator.py
@@ -190,14 +190,34 @@ def gen_random_dataframe(
return df


@pytest.fixture(scope="module") # type: ignore
def random_df() -> pd.DataFrame:
df1 = gen_random_dataframe(nrows=30, ncols=10, random_state=0).reset_index(drop=True)
df2 = gen_random_dataframe(nrows=30, ncols=10, na_ratio=0.1, random_state=1).reset_index(
drop=True
def gen_test_df() -> pd.DataFrame:
rand = np.random.RandomState(0)
nrows = 30
data = {}
data[0] = gen_random_dataframe(nrows=nrows, ncols=10, random_state=rand).reset_index(drop=True)
data[1] = gen_random_dataframe(
nrows=nrows, ncols=10, na_ratio=0.1, random_state=rand
).reset_index(drop=True)
data[2] = pd.Series([np.nan] * nrows, name="const_na")
data[3] = pd.Series(["s"] * nrows, name="const_str")
data[4] = pd.Series([0] * nrows, name="const_zero")
data[5] = pd.Series([-1] * nrows, name="const_neg")
data[6] = pd.Series([1] * nrows, name="const_pos")
data[7] = pd.Series([0, 1, np.nan] * (nrows // 3), name="small_distinct_miss")
data[8] = gen_random_series(size=nrows, dtype="string", random_state=rand).rename("str_no_miss")
data[9] = gen_random_series(size=nrows, dtype="string", na_ratio=0.1, random_state=rand).rename(
"str_miss"
)
df3 = gen_constant_series(30, np.nan).to_frame().reset_index(drop=True)
df4 = gen_constant_series(30, "s").to_frame().reset_index(drop=True)
df = pd.concat([df1, df2, df3, df4], axis=1)
data[10] = gen_random_series(size=nrows, dtype="float", random_state=rand).rename("num_no_miss")
data[11] = gen_random_series(size=nrows, dtype="float", na_ratio=0.1, random_state=rand).rename(
"num_miss"
)

df = pd.concat(data.values(), axis=1)
df.index = gen_random_series(df.index.shape[0], na_ratio=0.1, str_max_len=100, random_state=2)
return df


@pytest.fixture(scope="module") # type: ignore
def random_df() -> pd.DataFrame:
return gen_test_df()
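
The `random_df` fixture is rebuilt on top of `gen_test_df`, which adds constant, small-distinct, string, and numeric columns both with and without missing values. A hedged sketch of how such a module-scoped fixture is typically consumed in the EDA tests; the test name is illustrative, not necessarily one from the repository:

```python
import pandas as pd
from dataprep.eda import plot_missing

def test_missing_handles_all_dtypes(random_df: pd.DataFrame) -> None:
    # Column names ("num_miss", "str_miss") come from gen_test_df above.
    plot_missing(random_df)
    plot_missing(random_df, "num_miss")
    plot_missing(random_df, "num_miss", "str_miss")
```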