Skip to content

Commit

Permalink
refactor(eda): use EDAFrame in plot and change API col names
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed May 19, 2021
1 parent c95a7ff commit 95074f5
Show file tree
Hide file tree
Showing 18 changed files with 383 additions and 299 deletions.
10 changes: 5 additions & 5 deletions dataprep/eda/correlation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

def plot_correlation(
df: Union[pd.DataFrame, dd.DataFrame],
x: Optional[str] = None,
y: Optional[str] = None,
col1: Optional[str] = None,
col2: Optional[str] = None,
*,
value_range: Optional[Tuple[float, float]] = None,
k: Optional[int] = None,
Expand All @@ -37,9 +37,9 @@ def plot_correlation(
----------
df
The pandas data_frame for which plots are calculated for each column.
x
col1
A valid column name of the data frame.
y
col2
A valid column name of the data frame.
value_range
Range of value.
Expand Down Expand Up @@ -76,7 +76,7 @@ def plot_correlation(
cfg = Config.from_dict(display, config)

with ProgressBar(minimum=1, disable=not progress):
itmdt = compute_correlation(df, x, y, cfg=cfg, value_range=value_range, k=k)
itmdt = compute_correlation(df, col1, col2, cfg=cfg, value_range=value_range, k=k)
to_render = render_correlation(itmdt, cfg)

return Container(to_render, itmdt.visual_type, cfg)
9 changes: 5 additions & 4 deletions dataprep/eda/correlation/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

def compute_correlation(
df: DataFrame,
x: Optional[str] = None,
y: Optional[str] = None,
col1: Optional[str] = None,
col2: Optional[str] = None,
*,
cfg: Union[Config, Dict[str, Any], None] = None,
display: Optional[List[str]] = None,
Expand All @@ -32,9 +32,9 @@ def compute_correlation(
The pandas dataframe for which plots are calculated for each column.
cfg
Config instance
x
col1
A valid column name of the dataframe
y
col2
A valid column name of the dataframe
value_range
If the correlation value is out of the range, don't show it.
Expand All @@ -55,6 +55,7 @@ def compute_correlation(
elif not cfg:
cfg = Config()

x, y = col1, col2
frame = EDAFrame(df)
if x is None and y is None: # pylint: disable=no-else-return
with catch_warnings():
Expand Down
2 changes: 1 addition & 1 deletion dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def basic_computations(
data[col]["line"] = dask.delayed(_calc_line_dt)(df.frame[[col]], "auto")
# overview
if cfg.overview.enable:
data["ov"] = calc_stats(df.frame, cfg, None)
data["ov"] = calc_stats(df, cfg)
data["insights"] = []
for col in df.columns:
col_dtype = df.get_eda_dtype(col)
Expand Down
16 changes: 8 additions & 8 deletions dataprep/eda/distribution/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from ..configs import Config
from ..container import Container
from ..dtypes import DTypeDef, LatLong
from ..dtypes_v2 import DTypeDef, LatLong
from ...progress_bar import ProgressBar
from .compute import compute
from .render import render
Expand All @@ -19,9 +19,9 @@

def plot(
df: Union[pd.DataFrame, dd.DataFrame],
x: Optional[Union[str, LatLong]] = None,
y: Optional[Union[str, LatLong]] = None,
z: Optional[str] = None,
col1: Optional[Union[str, LatLong]] = None,
col2: Optional[Union[str, LatLong]] = None,
col3: Optional[str] = None,
*,
config: Optional[Dict[str, Any]] = None,
display: Optional[List[str]] = None,
Expand Down Expand Up @@ -66,11 +66,11 @@ def plot(
----------
df
DataFrame from which visualizations are generated
x: Optional[str], default None
col1: Optional[str], default None
A valid column name from the dataframe
y: Optional[str], default None
col2: Optional[str], default None
A valid column name from the dataframe
z: Optional[str], default None
col3: Optional[str], default None
A valid column name from the dataframe
config
A dictionary for configuring the visualizations
Expand Down Expand Up @@ -98,7 +98,7 @@ def plot(
cfg = Config.from_dict(display, config)

with ProgressBar(minimum=1, disable=not progress):
itmdt = compute(df, x, y, z, cfg=cfg, dtype=dtype)
itmdt = compute(df, col1, col2, col3, cfg=cfg, dtype=dtype)

to_render = render(itmdt, cfg)

Expand Down
81 changes: 25 additions & 56 deletions dataprep/eda/distribution/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
import pandas as pd

from ...configs import Config
from ...dtypes import DTypeDef, is_dtype, GeoPoint, LatLong
from ...dtypes_v2 import DTypeDef, LatLong
from ...intermediate import Intermediate
from ...utils import preprocess_dataframe
from .bivariate import compute_bivariate
from .overview import compute_overview
from .trivariate import compute_trivariate
Expand All @@ -22,9 +21,9 @@

def compute(
df: Union[pd.DataFrame, dd.DataFrame],
x: Optional[Union[str, LatLong]] = None,
y: Optional[Union[str, LatLong]] = None,
z: Optional[str] = None,
col1: Optional[Union[str, LatLong]] = None,
col2: Optional[Union[str, LatLong]] = None,
col3: Optional[str] = None,
*,
cfg: Union[Config, Dict[str, Any], None] = None,
display: Optional[List[str]] = None,
Expand All @@ -45,11 +44,11 @@ def compute(
display: Optional[List[str]], default None
A list containing the names of the visualizations to display. Only exist when
a user call compute() directly and want to customize the output
x: Optional[str], default None
col1: Optional[str], default None
A valid column name from the dataframe
y: Optional[str], default None
col2: Optional[str], default None
A valid column name from the dataframe
z: Optional[str], default None
col3: Optional[str], default None
A valid column name from the dataframe
dtype: str or DType or dict of str or dict of DType, default None
Specify Data Types for designated column or all columns.
Expand All @@ -61,30 +60,35 @@ def compute(

suppress_warnings()

params, exlude, ddf = process_latlong(df, x, y, z)
ddf = preprocess_dataframe(ddf, excluded_columns=exlude)

if isinstance(cfg, dict):
cfg = Config.from_dict(display, cfg)

elif not cfg:
cfg = Config()

if not any(params):
return compute_overview(ddf, cfg, dtype)
x, y, z = col1, col2, col3

if not any([x, y, z]):
return compute_overview(df, cfg, dtype)

if sum(v is None for v in params) == 2:
x = params[0] or params[1] or params[2]
return compute_univariate(ddf, x, cfg, dtype)
if sum(v is None for v in (x, y, z)) == 2:
x = x or y or z
if x is None:
raise ValueError
return compute_univariate(df, x, cfg, dtype)

if sum(v is None for v in params) == 1:
x, y = (v for v in params if v is not None)
return compute_bivariate(ddf, x, y, cfg, dtype)
if sum(v is None for v in [x, y, z]) == 1:
x, y = (v for v in [x, y, z] if v is not None)
if x is None or y is None:
raise ValueError
return compute_bivariate(df, x, y, cfg, dtype)

if x is not None and y is not None and z is not None:
return compute_trivariate(ddf, x, y, z, cfg, dtype)
if not (isinstance(x, str) and isinstance(y, str) and isinstance(z, str)):
raise TypeError("Column names should be string. Current column names: {x}, {y}, {z}")
return compute_trivariate(df, x, y, z, cfg, dtype)

raise ValueError("not possible")
raise ValueError("The input is not correct.")


def suppress_warnings() -> None:
Expand All @@ -96,38 +100,3 @@ def suppress_warnings() -> None:
"The default value of regex will change from True to False in a future version",
category=FutureWarning,
)


def concat_latlong(df: Union[pd.DataFrame, dd.DataFrame], x: Any) -> Tuple[str, Any]:
    """
    Merge the latitude and longitude columns named by ``x`` into one new column.

    Parameters
    ----------
    df
        Data container indexed by column label; both ``df[x.lat]`` and
        ``df[x.long]`` must be iterable.
    x
        Object exposing ``lat`` and ``long`` attributes that hold the labels
        of the two columns to merge.

    Returns
    -------
    Tuple[str, Any]
        The merged column name ``"<lat>_&_<long>"`` and a tuple holding one
        ``(lat, long)`` pair per row.
    """

    # Format the labels instead of concatenating them, so that non-string
    # column labels (e.g. integers) no longer raise a TypeError.
    name = f"{x.lat}_&_{x.long}"
    lat_long = tuple(zip(df[x.lat], df[x.long]))

    return name, lat_long


def process_latlong(
    df: pd.DataFrame,
    x: Optional[Union[str, LatLong]] = None,
    y: Optional[Union[str, LatLong]] = None,
    z: Optional[str] = None,
) -> Tuple[List[Optional[Union[str, LatLong]]], List[str], pd.DataFrame]:
    """
    Process the LatLong data type.

    Every argument among ``x``, ``y`` and ``z`` that is a ``GeoPoint`` is
    replaced by the name of a merged column built with ``concat_latlong``;
    that column is added to a copy of ``df``. Other arguments (including
    ``None``) are passed through unchanged.

    Returns
    -------
    A 3-tuple of:
      * the list of resolved parameters, in ``(x, y, z)`` order,
      * the list of names of the merged columns that were added,
      * the copy of ``df`` carrying the added columns.
    """

    merged_df = df.copy()
    resolved = []
    merged_names: List[str] = []

    for arg in (x, y, z):
        if not isinstance(arg, GeoPoint):
            # Not a GeoPoint: keep the argument exactly as it was given.
            resolved.append(arg)
            continue

        # Values are read from the original frame; the new column goes on the copy.
        merged_name, pairs = concat_latlong(df, arg)
        merged_df[merged_name] = pairs
        merged_names.append(merged_name)
        resolved.append(merged_name)

    return resolved, merged_names, merged_df
Loading

0 comments on commit 95074f5

Please sign in to comment.