fix: Histogram chart not able to use decimal datatype column (#30416)

apache · Sep 30, 2024 · 4834390 · 4834390
1 parent bdd50c7
commit 4834390
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 24 deletions.
diff --git a/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts b/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts
@@ -25,7 +25,6 @@ export default function buildQuery(formData: HistogramFormData) {
   return buildQueryContext(formData, baseQueryObject => [
     {
       ...baseQueryObject,
-      extras: { where: `${column} IS NOT NULL` },
       columns: [...groupby, column],
       post_processing: [histogramOperator(formData, baseQueryObject)],
       metrics: undefined,

diff --git a/superset/utils/pandas_postprocessing/histogram.py b/superset/utils/pandas_postprocessing/histogram.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 import numpy as np
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, to_numeric
 
 
 # pylint: disable=too-many-arguments
@@ -48,12 +48,15 @@ def histogram(
     if groupby is None:
         groupby = []
 
-    # check if the column is numeric
-    if not np.issubdtype(df[column].dtype, np.number):
-        raise ValueError(f"The column '{column}' must be numeric.")
+    # convert to numeric, coercing errors to NaN
+    df[column] = to_numeric(df[column], errors="coerce")
+
+    # check if the column contains non-numeric values
+    if df[column].isna().any():
+        raise ValueError(f"Column '{column}' contains non-numeric values")
 
     # calculate the histogram bin edges
-    bin_edges = np.histogram_bin_edges(df[column].dropna(), bins=bins)
+    bin_edges = np.histogram_bin_edges(df[column], bins=bins)
 
     # convert the bin edges to strings
     bin_edges_str = [
@@ -62,6 +65,7 @@ def histogram(
     ]
 
     def hist_values(series: Series) -> np.ndarray:
+        # we might have NaN values as the result of grouping so we need to drop them
         result = np.histogram(series.dropna(), bins=bin_edges)[0]
         return result if not cumulative else np.cumsum(result)
 

diff --git a/tests/unit_tests/pandas_postprocessing/test_histogram.py b/tests/unit_tests/pandas_postprocessing/test_histogram.py
@@ -117,28 +117,20 @@ def test_histogram_with_groupby_and_cumulative_and_normalize():
 
 def test_histogram_with_non_numeric_column():
     try:
-        histogram(data, "b", ["group"], bins)
+        histogram(data, "group", None, bins)
     except ValueError as e:
-        assert str(e) == "The column 'b' must be numeric."
+        assert str(e) == "Column 'group' contains non-numeric values"
 
 
-# test histogram ignore null values
-def test_histogram_ignore_null_values():
-    data_with_null = DataFrame(
+def test_histogram_with_some_non_numeric_values():
+    data_with_non_numeric = DataFrame(
         {
             "group": ["A", "A", "B", "B", "A", "A", "B", "B", "A", "A"],
-            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
-            "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
+            "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
         }
     )
-    result = histogram(data_with_null, "a", ["group"], bins)
-    assert result.shape == (2, bins + 1)
-    assert result.columns.tolist() == [
-        "group",
-        "1 - 2",
-        "2 - 4",
-        "4 - 5",
-        "5 - 7",
-        "7 - 9",
-    ]
-    assert result.values.tolist() == [["A", 2, 0, 1, 1, 1], ["B", 0, 2, 0, 1, 1]]
+    try:
+        histogram(data_with_non_numeric, "a", ["group"], bins)
+    except ValueError as e:
+        assert str(e) == "Column 'a' contains non-numeric values"