Histogram: coerce the target column to numeric during post-processing

Review notes (free text placed ahead of the first "diff --git" header):
* The frontend hunk removes the `${column} IS NOT NULL` filter from the
  query's `extras`.
* histogram() now passes the column through to_numeric(errors="coerce") and
  raises ValueError when any value is NaN afterwards. The previous dtype
  check and the .dropna() applied before computing bin edges are removed.
* NOTE(review): the coerced series is assigned back into df[column]; unless
  a copy is taken earlier in the function (outside these hunks), the frame
  passed in is modified. Confirm callers do not reuse it.
* NOTE(review): test_histogram_with_some_non_numeric_values uses the string
  "10", which to_numeric presumably parses without error, so its except
  branch may never execute. Confirm the intended fixture.
* Changed in this revision: that test's expected message now names column
  'a' (the column the call passes) instead of 'group'. The post-image hash
  on the test file's "index" line predates this one-line change.

diff --git a/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts b/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts
index aed4492bd784e..4afcb1e4af3ce 100644
--- a/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts
+++ b/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts
@@ -25,7 +25,6 @@ export default function buildQuery(formData: HistogramFormData) {
   return buildQueryContext(formData, baseQueryObject => [
     {
       ...baseQueryObject,
-      extras: { where: `${column} IS NOT NULL` },
       columns: [...groupby, column],
       post_processing: [histogramOperator(formData, baseQueryObject)],
       metrics: undefined,
diff --git a/superset/utils/pandas_postprocessing/histogram.py b/superset/utils/pandas_postprocessing/histogram.py
index d91e129e8c970..dbe93ef32b158 100644
--- a/superset/utils/pandas_postprocessing/histogram.py
+++ b/superset/utils/pandas_postprocessing/histogram.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 import numpy as np
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, to_numeric
 
 
 # pylint: disable=too-many-arguments
@@ -48,12 +48,15 @@ def histogram(
     if groupby is None:
         groupby = []
 
-    # check if the column is numeric
-    if not np.issubdtype(df[column].dtype, np.number):
-        raise ValueError(f"The column '{column}' must be numeric.")
+    # convert to numeric, coercing errors to NaN
+    df[column] = to_numeric(df[column], errors="coerce")
+
+    # check if the column contains non-numeric values
+    if df[column].isna().any():
+        raise ValueError(f"Column '{column}' contains non-numeric values")
 
     # calculate the histogram bin edges
-    bin_edges = np.histogram_bin_edges(df[column].dropna(), bins=bins)
+    bin_edges = np.histogram_bin_edges(df[column], bins=bins)
 
     # convert the bin edges to strings
     bin_edges_str = [
@@ -62,6 +65,7 @@ def histogram(
     ]
 
     def hist_values(series: Series) -> np.ndarray:
+        # we might have NaN values as the result of grouping so we need to drop them
         result = np.histogram(series.dropna(), bins=bin_edges)[0]
         return result if not cumulative else np.cumsum(result)
 
diff --git a/tests/unit_tests/pandas_postprocessing/test_histogram.py b/tests/unit_tests/pandas_postprocessing/test_histogram.py
index 6ea4c34f57f6c..73370c8e62fdb 100644
--- a/tests/unit_tests/pandas_postprocessing/test_histogram.py
+++ b/tests/unit_tests/pandas_postprocessing/test_histogram.py
@@ -117,28 +117,20 @@ def test_histogram_with_groupby_and_cumulative_and_normalize():
 
 def test_histogram_with_non_numeric_column():
     try:
-        histogram(data, "b", ["group"], bins)
+        histogram(data, "group", None, bins)
     except ValueError as e:
-        assert str(e) == "The column 'b' must be numeric."
+        assert str(e) == "Column 'group' contains non-numeric values"
 
 
-# test histogram ignore null values
-def test_histogram_ignore_null_values():
-    data_with_null = DataFrame(
+def test_histogram_with_some_non_numeric_values():
+    data_with_non_numeric = DataFrame(
         {
             "group": ["A", "A", "B", "B", "A", "A", "B", "B", "A", "A"],
-            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
-            "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
+            "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
         }
     )
-    result = histogram(data_with_null, "a", ["group"], bins)
-    assert result.shape == (2, bins + 1)
-    assert result.columns.tolist() == [
-        "group",
-        "1 - 2",
-        "2 - 4",
-        "4 - 5",
-        "5 - 7",
-        "7 - 9",
-    ]
-    assert result.values.tolist() == [["A", 2, 0, 1, 1, 1], ["B", 0, 2, 0, 1, 1]]
+    try:
+        histogram(data_with_non_numeric, "a", ["group"], bins)
+    except ValueError as e:
+        assert str(e) == "Column 'a' contains non-numeric values"