From cd9aa243b6d9e2a8faceaa9379c4872ff7e27488 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 2 Jun 2019 19:34:42 +0200 Subject: [PATCH 1/4] Better TypeError for wrong dtype in str.cat --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/strings.py | 56 +++++++++++++++++++++++---------- pandas/tests/test_strings.py | 16 ++++++++++ 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1619ba1a45739..3e8383c66abc2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -575,7 +575,7 @@ Strings ^^^^^^^ - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) -- +- Improved error message when passing ``Series`` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`) - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bd756491abd2f..218577016fd0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2280,6 +2280,23 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 'must all be of the same length as the ' 'calling Series/Index.') + # data has already been checked by _validate to be of correct dtype, + # but others could still have Series of dtypes (e.g. integers) which + # will necessarily fail in concatenation. To avoid deep and confusing + # traces, we raise here for anything that's not object or all-NA float. + def _legal_dtype(series): + # unify dtype handling between categorical/non-categorical + dtype = (series.dtype if not is_categorical_dtype(series) + else series.cat.categories.dtype) + legal = dtype == 'O' or (dtype == 'float' and series.isna().all()) + return legal + err_wrong_dtype = ('Can only concatenate list-likes containing only ' + 'strings (or missing values).') + if any(not _legal_dtype(x) for x in others): + raise TypeError(err_wrong_dtype + ' Received list-like of dtype: ' + '{}'.format([x.dtype for x in others + if not _legal_dtype(x)][0])) + if join is None and warn: warnings.warn("A future version of pandas will perform index " "alignment when `others` is a Series/Index/" @@ -2307,23 +2324,28 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): na_masks = np.array([isna(x) for x in all_cols]) union_mask = np.logical_or.reduce(na_masks, axis=0) - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_core([x[not_masked] for x in all_cols], - sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [np.where(nm, na_rep, col) - for nm, col in zip(na_masks, all_cols)] - result = cat_core(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_core(all_cols, sep) + # if there are any non-string, non-null values hidden within an object + # dtype, cat_core will fail; catch error and return with better message + try: + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_core([x[not_masked] + for x in all_cols], sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols)] + result = cat_core(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_core(all_cols, sep) + except TypeError: + raise TypeError(err_wrong_dtype) if isinstance(self._orig, Index): # add dtype for case that result is all-NA diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1ba0ef3918fb7..5011cead017c1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -420,6 +420,22 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): result = s.str.cat(t, sep=sep) assert_series_or_index_equal(result, expected) + # test integer/float dtypes (inferred by constructor) and mixed + @pytest.mark.parametrize('data', [[1, 2, 3], [.1, .2, .3], [1, 2, 'b']], + ids=['integers', 'floats', 'mixed']) + # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b'] + @pytest.mark.parametrize('box', [Series, Index, list, + lambda x: np.array(x, dtype=object)], + ids=['Series', 'Index', 'list', 'np.array']) + def test_str_cat_wrong_dtype_raises(self, box, data): + # GH 22722 + s = Series(['a', 'b', 'c']) + t = box(data) + + msg = 'Can only concatenate list-likes containing only strings.*' + with pytest.raises(TypeError, match=msg): + s.str.cat(t, join='left') + @pytest.mark.parametrize('box', [Series, Index]) def test_str_cat_mixed_inputs(self, box): s = Index(['a', 'b', 'c', 'd']) From fd710dea08469b672b7e751d8ec0ceb3a4f2c060 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 10 Jun 2019 16:17:48 +0200 Subject: [PATCH 2/4] Review (jreback) --- pandas/core/strings.py | 77 ++++++++++++++++++------------------ pandas/tests/test_strings.py | 5 ++- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 218577016fd0e..d7f118f38cf0a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -53,6 +53,27 @@ def cat_core(list_of_columns, sep): return np.sum(list_with_sep, axis=0) +def cat_safe(list_of_columns, sep): + """ + Auxiliary function for :meth:`str.cat`. + + Same signature as cat_core, but handles TypeErrors in concatenation, which + happen if the Series in list_of columns have the wrong dtypes or content. + """ + # if there are any non-string values (wrong dtype or hidden behind object + # dtype), np.sum will fail; catch error and return with better message + try: + result = cat_core(list_of_columns, sep) + except TypeError: + dtypes = [lib.infer_dtype(x, skipna=True) for x in list_of_columns] + illegal = [x not in ('string', 'empty') for x in dtypes] + first_offender = [x for x, y in zip(list_of_columns, illegal) if y][0] + raise TypeError('Concatenation requires list-likes containing only ' + 'strings (or missing values). Offending values found ' + 'in column {}'.format(first_offender)) from None + return result + + def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) @@ -2280,23 +2301,6 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 'must all be of the same length as the ' 'calling Series/Index.') - # data has already been checked by _validate to be of correct dtype, - # but others could still have Series of dtypes (e.g. integers) which - # will necessarily fail in concatenation. To avoid deep and confusing - # traces, we raise here for anything that's not object or all-NA float. - def _legal_dtype(series): - # unify dtype handling between categorical/non-categorical - dtype = (series.dtype if not is_categorical_dtype(series) - else series.cat.categories.dtype) - legal = dtype == 'O' or (dtype == 'float' and series.isna().all()) - return legal - err_wrong_dtype = ('Can only concatenate list-likes containing only ' - 'strings (or missing values).') - if any(not _legal_dtype(x) for x in others): - raise TypeError(err_wrong_dtype + ' Received list-like of dtype: ' - '{}'.format([x.dtype for x in others - if not _legal_dtype(x)][0])) - if join is None and warn: warnings.warn("A future version of pandas will perform index " "alignment when `others` is a Series/Index/" @@ -2324,28 +2328,23 @@ def _legal_dtype(series): na_masks = np.array([isna(x) for x in all_cols]) union_mask = np.logical_or.reduce(na_masks, axis=0) - # if there are any non-string, non-null values hidden within an object - # dtype, cat_core will fail; catch error and return with better message - try: - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_core([x[not_masked] - for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [np.where(nm, na_rep, col) - for nm, col in zip(na_masks, all_cols)] - result = cat_core(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_core(all_cols, sep) - except TypeError: - raise TypeError(err_wrong_dtype) + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], + sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols)] + result = cat_safe(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_safe(all_cols, sep) if isinstance(self._orig, Index): # add dtype for case that result is all-NA diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 749ce1da1c1bb..955554f60aa1f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -440,9 +440,10 @@ def test_str_cat_wrong_dtype_raises(self, box, data): s = Series(['a', 'b', 'c']) t = box(data) - msg = 'Can only concatenate list-likes containing only strings.*' + msg = 'Concatenation requires list-likes containing only strings.*' with pytest.raises(TypeError, match=msg): - s.str.cat(t, join='left') + # need to use outer and na_rep, as otherwise Index would not raise + s.str.cat(t, join='outer', na_rep='-') @pytest.mark.parametrize('box', [Series, Index]) def test_str_cat_mixed_inputs(self, box): From 02f6429662ea83b481a2156549bff482c3fcf24a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 12 Jun 2019 22:54:30 +0200 Subject: [PATCH 3/4] Review (jreback & simonjayhawkins) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/strings.py | 32 +++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7f0e50525e61b..2021a553dd69b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -607,7 +607,7 @@ Strings ^^^^^^^ - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) -- Improved error message when passing ``Series`` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`) +- Improved error message when passing :class:`Series` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`) - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index d7f118f38cf0a..4c767a6c4b2e4 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -58,19 +58,33 @@ def cat_safe(list_of_columns, sep): Auxiliary function for :meth:`str.cat`. Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the Series in list_of columns have the wrong dtypes or content. + happen if the arrays in list_of columns have the wrong dtypes or content. + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep """ - # if there are any non-string values (wrong dtype or hidden behind object - # dtype), np.sum will fail; catch error and return with better message try: result = cat_core(list_of_columns, sep) except TypeError: - dtypes = [lib.infer_dtype(x, skipna=True) for x in list_of_columns] - illegal = [x not in ('string', 'empty') for x in dtypes] - first_offender = [x for x, y in zip(list_of_columns, illegal) if y][0] - raise TypeError('Concatenation requires list-likes containing only ' - 'strings (or missing values). Offending values found ' - 'in column {}'.format(first_offender)) from None + # if there are any non-string values (wrong dtype or hidden behind + # object dtype), np.sum will fail; catch and return with better message + for column in list_of_columns: + dtype = lib.infer_dtype(column, skipna=True) + if dtype not in ['string', 'empty']: + raise TypeError( + 'Concatenation requires list-likes containing only ' + 'strings (or missing values). Offending values found in ' + 'column {}'.format(dtype)) from None return result From 9752aa7311ef7f9d80384d63d3a9b58ba00ab362 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 13 Jun 2019 22:00:59 +0200 Subject: [PATCH 4/4] Add typing to cat_core/cat_safe (review jreback) --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fcac17e8bfdbe..413c0e73f8410 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import Dict +from typing import Dict, List import warnings import numpy as np @@ -31,7 +31,7 @@ _shared_docs = dict() # type: Dict[str, str] -def cat_core(list_of_columns, sep): +def cat_core(list_of_columns: List, sep: str): """ Auxiliary function for :meth:`str.cat` @@ -53,7 +53,7 @@ def cat_core(list_of_columns, sep): return np.sum(list_with_sep, axis=0) -def cat_safe(list_of_columns, sep): +def cat_safe(list_of_columns: List, sep: str): """ Auxiliary function for :meth:`str.cat`.