fix(eda.create_report): handle unhashable dtypes

sfu-db committed on Oct 3, 2020
commit 7743749 (1 parent: 2153b74)
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 44 deletions.
diff --git a/dataprep/eda/create_report/formatter.py b/dataprep/eda/create_report/formatter.py
@@ -137,42 +137,45 @@ def format_basic(df: dd.DataFrame) -> Dict[str, Any]:
             "col_type": itmdt.visual_type.replace("_column", ""),
         }
 
-    # interactions
-    res["has_interaction"] = True
-    itmdt = Intermediate(data=data["scat"], visual_type="correlation_crossfilter")
-    rndrd = render_correlation(itmdt)
-    rndrd.sizing_mode = "stretch_width"
-    res["interactions"] = components(rndrd)
-
-    # correlations
-    res["has_correlation"] = True
-    dfs: Dict[str, pd.DataFrame] = {}
-    for method, corr in data["corrs"].items():
-        ndf = pd.DataFrame(
-            {
-                "x": data["num_cols"][data["cordx"]],
-                "y": data["num_cols"][data["cordy"]],
-                "correlation": corr.ravel(),
-            }
+    if len(data["num_cols"]) > 0:
+        # interactions
+        res["has_interaction"] = True
+        itmdt = Intermediate(data=data["scat"], visual_type="correlation_crossfilter")
+        rndrd = render_correlation(itmdt)
+        rndrd.sizing_mode = "stretch_width"
+        res["interactions"] = components(rndrd)
+
+        # correlations
+        res["has_correlation"] = True
+        dfs: Dict[str, pd.DataFrame] = {}
+        for method, corr in data["corrs"].items():
+            ndf = pd.DataFrame(
+                {
+                    "x": data["num_cols"][data["cordx"]],
+                    "y": data["num_cols"][data["cordy"]],
+                    "correlation": corr.ravel(),
+                }
+            )
+            dfs[method.name] = ndf[data["cordy"] > data["cordx"]]
+        itmdt = Intermediate(
+            data=dfs,
+            axis_range=list(data["num_cols"]),
+            visual_type="correlation_heatmaps",
         )
-        dfs[method.name] = ndf[data["cordy"] > data["cordx"]]
-    itmdt = Intermediate(
-        data=dfs, axis_range=list(data["num_cols"]), visual_type="correlation_heatmaps",
-    )
-    rndrd = render_correlation(itmdt)
-    figs.clear()
-    for tab in rndrd.tabs:
-        fig = tab.child
-        fig.sizing_mode = "stretch_width"
-        fig.title = Title(text=tab.title, align="center", text_font_size="20px")
-        figs.append(fig)
-    res["correlations"] = components(figs)
+        rndrd = render_correlation(itmdt)
+        figs.clear()
+        for tab in rndrd.tabs:
+            fig = tab.child
+            fig.sizing_mode = "stretch_width"
+            fig.title = Title(text=tab.title, align="center", text_font_size="20px")
+            figs.append(fig)
+        res["correlations"] = components(figs)
+    else:
+        res["has_interaction"], res["has_correlation"] = False, False
 
     # missing
     res["has_missing"] = True
-
     itmdt = completions["miss"](data["miss"])
-
     rndrd = render_missing(itmdt)
     figs.clear()
     for tab in rndrd.tabs:
@@ -200,16 +203,21 @@ def basic_computations(df: dd.DataFrame) -> Tuple[Dict[str, Any], Dict[str, Any]
     data["num_cols"] = df_num.columns
     first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head
 
-    # overview
-    data["ov"] = calc_stats(df.frame, None)
-    # # variables
+    # variables
     for col in df.columns:
         if is_dtype(detect_dtype(df.frame[col]), Continuous()):
             data[col] = cont_comps(df.frame[col], 20)
         elif is_dtype(detect_dtype(df.frame[col]), Nominal()):
+            # cast the column as string type if it contains a mutable type
+            try:
+                first_rows[col].apply(hash)
+            except TypeError:
+                df.frame[col] = df.frame[col].astype(str)
             data[col] = nom_comps(
                 df.frame[col], first_rows[col], 10, True, 10, 20, True, False, False
             )
+    # overview
+    data["ov"] = calc_stats(df.frame, None)
     # interactions
     data["scat"] = df_num.frame.map_partitions(
         lambda x: x.sample(min(1000, x.shape[0])), meta=df_num.frame

diff --git a/dataprep/eda/distribution/compute/overview.py b/dataprep/eda/distribution/compute/overview.py
@@ -241,10 +241,8 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:
     ----------
     df
         a DataFrame
-    dtype_cnts
-        a dictionary that contains the count for each type
-    num_cols:
-        numerical columns in the dataset
+    dtype
+        str or DType or dict of str or dict of DType
     """
 
     stats = {"nrows": df.shape[0]}

diff --git a/dataprep/eda/distribution/compute/univariate.py b/dataprep/eda/distribution/compute/univariate.py
@@ -88,6 +88,11 @@ def compute_univariate(
     col_dtype = detect_dtype(df[x], dtype)
     if is_dtype(col_dtype, Nominal()):
         first_rows = df[x].head()  # dd.Series.head() triggers a (small) data read
+        # cast the column as string type if it contains a mutable type
+        try:
+            first_rows.apply(hash)
+        except TypeError:
+            df[x] = df[x].astype(str)
         # all computations for plot(df, Nominal())
         data = nom_comps(
             df[x],
@@ -170,11 +175,6 @@ def nom_comps(
 
     # total rows
     data["nrows"] = srs.shape[0]
-    # cast the column as string type if it contains a mutable type
-    try:
-        first_rows.apply(hash)
-    except TypeError:
-        srs = srs.astype(str)
     # drop null values
     srs = srs.dropna()
 

diff --git a/dataprep/tests/eda/test_create_report.py b/dataprep/tests/eda/test_create_report.py
@@ -17,6 +17,7 @@ def simpledf() -> pd.DataFrame:
     df = pd.concat(
         [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1
     )
+    df = pd.concat([df, pd.Series([["foo"] * 1000])], axis=1)
     df = pd.concat(
         [
             df,
@@ -29,7 +30,7 @@ def simpledf() -> pd.DataFrame:
         axis=1,
     )
     # df = pd.concat([df, pd.Series(np.zeros(1000))], axis=1)
-    df.columns = ["a", "b", "c", "d", "e"]
+    df.columns = ["a", "b", "c", "d", "e", "f"]
     # df["e"] = pd.to_datetime(df["e"])
 
     idx = np.arange(1000)