refactor(eda): use EDAFrame in plot and change API col names

sfu-db · May 19, 2021 · 95074f5 · 95074f5
1 parent c95a7ff
commit 95074f5
Show file tree

Hide file tree

Showing 18 changed files with 383 additions and 299 deletions.
diff --git a/dataprep/eda/correlation/__init__.py b/dataprep/eda/correlation/__init__.py
@@ -18,8 +18,8 @@
 
 def plot_correlation(
     df: Union[pd.DataFrame, dd.DataFrame],
-    x: Optional[str] = None,
-    y: Optional[str] = None,
+    col1: Optional[str] = None,
+    col2: Optional[str] = None,
     *,
     value_range: Optional[Tuple[float, float]] = None,
     k: Optional[int] = None,
@@ -37,9 +37,9 @@ def plot_correlation(
     ----------
     df
         The pandas data_frame for which plots are calculated for each column.
-    x
+    col1
         A valid column name of the data frame.
-    y
+    col2
         A valid column name of the data frame.
     value_range
         Range of value.
@@ -76,7 +76,7 @@ def plot_correlation(
     cfg = Config.from_dict(display, config)
 
     with ProgressBar(minimum=1, disable=not progress):
-        itmdt = compute_correlation(df, x, y, cfg=cfg, value_range=value_range, k=k)
+        itmdt = compute_correlation(df, col1, col2, cfg=cfg, value_range=value_range, k=k)
     to_render = render_correlation(itmdt, cfg)
 
     return Container(to_render, itmdt.visual_type, cfg)
diff --git a/dataprep/eda/correlation/compute/__init__.py b/dataprep/eda/correlation/compute/__init__.py
@@ -16,8 +16,8 @@
 
 def compute_correlation(
     df: DataFrame,
-    x: Optional[str] = None,
-    y: Optional[str] = None,
+    col1: Optional[str] = None,
+    col2: Optional[str] = None,
     *,
     cfg: Union[Config, Dict[str, Any], None] = None,
     display: Optional[List[str]] = None,
@@ -32,9 +32,9 @@ def compute_correlation(
         The pandas dataframe for which plots are calculated for each column.
     cfg
         Config instance
-    x
+    col1
         A valid column name of the dataframe
-    y
+    col2
         A valid column name of the dataframe
     value_range
         If the correlation value is out of the range, don't show it.
@@ -55,6 +55,7 @@ def compute_correlation(
     elif not cfg:
         cfg = Config()
 
+    x, y = col1, col2
     frame = EDAFrame(df)
     if x is None and y is None:  # pylint: disable=no-else-return
         with catch_warnings():

diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
@@ -296,7 +296,7 @@ def basic_computations(
                 data[col]["line"] = dask.delayed(_calc_line_dt)(df.frame[[col]], "auto")
     # overview
     if cfg.overview.enable:
-        data["ov"] = calc_stats(df.frame, cfg, None)
+        data["ov"] = calc_stats(df, cfg)
         data["insights"] = []
         for col in df.columns:
             col_dtype = df.get_eda_dtype(col)

diff --git a/dataprep/eda/distribution/__init__.py b/dataprep/eda/distribution/__init__.py
@@ -9,7 +9,7 @@
 
 from ..configs import Config
 from ..container import Container
-from ..dtypes import DTypeDef, LatLong
+from ..dtypes_v2 import DTypeDef, LatLong
 from ...progress_bar import ProgressBar
 from .compute import compute
 from .render import render
@@ -19,9 +19,9 @@
 
 def plot(
     df: Union[pd.DataFrame, dd.DataFrame],
-    x: Optional[Union[str, LatLong]] = None,
-    y: Optional[Union[str, LatLong]] = None,
-    z: Optional[str] = None,
+    col1: Optional[Union[str, LatLong]] = None,
+    col2: Optional[Union[str, LatLong]] = None,
+    col3: Optional[str] = None,
     *,
     config: Optional[Dict[str, Any]] = None,
     display: Optional[List[str]] = None,
@@ -66,11 +66,11 @@ def plot(
     ----------
     df
         DataFrame from which visualizations are generated
-    x: Optional[str], default None
+    col1: Optional[Union[str, LatLong]], default None
         A valid column name from the dataframe
-    y: Optional[str], default None
+    col2: Optional[Union[str, LatLong]], default None
         A valid column name from the dataframe
-    z: Optional[str], default None
+    col3: Optional[str], default None
         A valid column name from the dataframe
     config
         A dictionary for configuring the visualizations
@@ -98,7 +98,7 @@ def plot(
     cfg = Config.from_dict(display, config)
 
     with ProgressBar(minimum=1, disable=not progress):
-        itmdt = compute(df, x, y, z, cfg=cfg, dtype=dtype)
+        itmdt = compute(df, col1, col2, col3, cfg=cfg, dtype=dtype)
 
     to_render = render(itmdt, cfg)
 

diff --git a/dataprep/eda/distribution/compute/__init__.py b/dataprep/eda/distribution/compute/__init__.py
@@ -9,9 +9,8 @@
 import pandas as pd
 
 from ...configs import Config
-from ...dtypes import DTypeDef, is_dtype, GeoPoint, LatLong
+from ...dtypes_v2 import DTypeDef, LatLong
 from ...intermediate import Intermediate
-from ...utils import preprocess_dataframe
 from .bivariate import compute_bivariate
 from .overview import compute_overview
 from .trivariate import compute_trivariate
@@ -22,9 +21,9 @@
 
 def compute(
     df: Union[pd.DataFrame, dd.DataFrame],
-    x: Optional[Union[str, LatLong]] = None,
-    y: Optional[Union[str, LatLong]] = None,
-    z: Optional[str] = None,
+    col1: Optional[Union[str, LatLong]] = None,
+    col2: Optional[Union[str, LatLong]] = None,
+    col3: Optional[str] = None,
     *,
     cfg: Union[Config, Dict[str, Any], None] = None,
     display: Optional[List[str]] = None,
@@ -45,11 +44,11 @@ def compute(
     display: Optional[List[str]], default None
         A list containing the names of the visualizations to display. Only exist when
         a user call compute() directly and want to customize the output
-    x: Optional[str], default None
+    col1: Optional[Union[str, LatLong]], default None
         A valid column name from the dataframe
-    y: Optional[str], default None
+    col2: Optional[Union[str, LatLong]], default None
         A valid column name from the dataframe
-    z: Optional[str], default None
+    col3: Optional[str], default None
         A valid column name from the dataframe
     dtype: str or DType or dict of str or dict of DType, default None
         Specify Data Types for designated column or all columns.
@@ -61,30 +60,35 @@ def compute(
 
     suppress_warnings()
 
-    params, exlude, ddf = process_latlong(df, x, y, z)
-    ddf = preprocess_dataframe(ddf, excluded_columns=exlude)
-
     if isinstance(cfg, dict):
         cfg = Config.from_dict(display, cfg)
 
     elif not cfg:
         cfg = Config()
 
-    if not any(params):
-        return compute_overview(ddf, cfg, dtype)
+    x, y, z = col1, col2, col3
+
+    if not any([x, y, z]):
+        return compute_overview(df, cfg, dtype)
 
-    if sum(v is None for v in params) == 2:
-        x = params[0] or params[1] or params[2]
-        return compute_univariate(ddf, x, cfg, dtype)
+    if sum(v is None for v in (x, y, z)) == 2:
+        x = x or y or z
+        if x is None:
+            raise ValueError
+        return compute_univariate(df, x, cfg, dtype)
 
-    if sum(v is None for v in params) == 1:
-        x, y = (v for v in params if v is not None)
-        return compute_bivariate(ddf, x, y, cfg, dtype)
+    if sum(v is None for v in [x, y, z]) == 1:
+        x, y = (v for v in [x, y, z] if v is not None)
+        if x is None or y is None:
+            raise ValueError
+        return compute_bivariate(df, x, y, cfg, dtype)
 
     if x is not None and y is not None and z is not None:
-        return compute_trivariate(ddf, x, y, z, cfg, dtype)
+        if not (isinstance(x, str) and isinstance(y, str) and isinstance(z, str)):
+            raise TypeError(f"Column names should be string. Current column names: {x}, {y}, {z}")
+        return compute_trivariate(df, x, y, z, cfg, dtype)
 
-    raise ValueError("not possible")
+    raise ValueError("The input is not correct.")
 
 
 def suppress_warnings() -> None:
@@ -96,38 +100,3 @@ def suppress_warnings() -> None:
         "The default value of regex will change from True to False in a future version",
         category=FutureWarning,
     )
-
-
-def concat_latlong(df: Union[pd.DataFrame, dd.DataFrame], x: Any) -> Tuple[str, Any]:
-    """
-    Merge Latlong into one new column.
-    """
-
-    name = x.lat + "_&_" + x.long
-    lat_long = tuple(zip(df[x.lat], df[x.long]))
-
-    return name, lat_long
-
-
-def process_latlong(
-    df: pd.DataFrame,
-    x: Optional[Union[str, LatLong]] = None,
-    y: Optional[Union[str, LatLong]] = None,
-    z: Optional[str] = None,
-) -> Tuple[List[Optional[Union[str, LatLong]]], List[str], pd.DataFrame]:
-    """
-    Process Latlong data tpye.
-    """
-
-    params = []
-    exclude: List[str] = []
-    add_df = df.copy()
-    for temp in (x, y, z):
-        name = temp
-        if isinstance(temp, GeoPoint):
-            name, lat_long = concat_latlong(df, temp)
-            add_df[name] = lat_long
-            exclude.append(name)
-        params.append(name)
-
-    return params, exclude, add_df