Skip to content

Commit

Permalink
Better error for str.cat with listlike of wrong dtype. (#26607)
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari authored and jreback committed Jun 14, 2019
1 parent a6f11ac commit 5d0ff69
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 6 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ Strings
^^^^^^^

- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
-
- Improved error message when passing :class:`Series` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`)
-


Expand Down
45 changes: 40 additions & 5 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import wraps
import re
import textwrap
from typing import Dict
from typing import Dict, List
import warnings

import numpy as np
Expand Down Expand Up @@ -31,7 +31,7 @@
_shared_docs = dict() # type: Dict[str, str]


def cat_core(list_of_columns, sep):
def cat_core(list_of_columns: List, sep: str):
"""
Auxiliary function for :meth:`str.cat`
Expand All @@ -53,6 +53,41 @@ def cat_core(list_of_columns, sep):
return np.sum(list_with_sep, axis=0)


def cat_safe(list_of_columns: List, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep

    Raises
    ------
    TypeError
        If the concatenation fails. When a column whose inferred dtype is
        neither 'string' nor 'empty' is found, the error names that inferred
        dtype; otherwise the original TypeError is re-raised unchanged.
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError:
        # if there are any non-string values (wrong dtype or hidden behind
        # object dtype), np.sum will fail; catch and return with better message
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ['string', 'empty']:
                raise TypeError(
                    'Concatenation requires list-likes containing only '
                    'strings (or missing values). Offending values found in '
                    'column {}'.format(dtype)) from None
        # No column could be blamed (e.g. the failure came from ``sep``).
        # Re-raise the original error instead of falling through to an
        # unbound ``result``, which would surface as UnboundLocalError.
        raise
    return result


def _na_map(f, arr, na_result=np.nan, dtype=object):
    """
    Forward ``f`` and ``arr`` to ``_map`` with NA masking switched on.

    ``na_result`` is handed to ``_map`` as ``na_value`` and ``dtype`` is
    passed through unchanged.
    """
    # should really _check_ for NA
    masking_options = dict(na_mask=True, na_value=na_result, dtype=dtype)
    return _map(f, arr, **masking_options)
Expand Down Expand Up @@ -2314,16 +2349,16 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
np.putmask(result, union_mask, np.nan)

not_masked = ~union_mask
result[not_masked] = cat_core([x[not_masked] for x in all_cols],
result[not_masked] = cat_safe([x[not_masked] for x in all_cols],
sep)
elif na_rep is not None and union_mask.any():
# fill NaNs with na_rep in case there are actually any NaNs
all_cols = [np.where(nm, na_rep, col)
for nm, col in zip(na_masks, all_cols)]
result = cat_core(all_cols, sep)
result = cat_safe(all_cols, sep)
else:
# no NaNs - can just concatenate
result = cat_core(all_cols, sep)
result = cat_safe(all_cols, sep)

if isinstance(self._orig, Index):
# add dtype for case that result is all-NA
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,23 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep):
result = s.str.cat(t, sep=sep)
assert_series_or_index_equal(result, expected)

# test integer/float dtypes (inferred by constructor) and mixed
@pytest.mark.parametrize('data', [[1, 2, 3], [.1, .2, .3], [1, 2, 'b']],
                         ids=['integers', 'floats', 'mixed'])
# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
@pytest.mark.parametrize('box', [Series, Index, list,
                                 lambda x: np.array(x, dtype=object)],
                         ids=['Series', 'Index', 'list', 'np.array'])
def test_str_cat_wrong_dtype_raises(self, box, data):
    """Check that str.cat raises TypeError for non-string list-likes."""
    # GH 22722
    caller = Series(['a', 'b', 'c'])
    others = box(data)

    expected_msg = (
        'Concatenation requires list-likes containing only strings.*')
    # need to use outer and na_rep, as otherwise Index would not raise
    with pytest.raises(TypeError, match=expected_msg):
        caller.str.cat(others, join='outer', na_rep='-')

@pytest.mark.parametrize('box', [Series, Index])
def test_str_cat_mixed_inputs(self, box):
s = Index(['a', 'b', 'c', 'd'])
Expand Down

0 comments on commit 5d0ff69

Please sign in to comment.