From cd9aa243b6d9e2a8faceaa9379c4872ff7e27488 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 2 Jun 2019 19:34:42 +0200
Subject: [PATCH 1/4] Better TypeError for wrong dtype in str.cat

---
 doc/source/whatsnew/v0.25.0.rst |  2 +-
 pandas/core/strings.py          | 56 +++++++++++++++++++++++----------
 pandas/tests/test_strings.py    | 16 ++++++++++
 3 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 1619ba1a45739..3e8383c66abc2 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -575,7 +575,7 @@ Strings
 ^^^^^^^
 
 - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
--
+- Improved error message when passing ``Series`` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`)
 -
 
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index bd756491abd2f..218577016fd0e 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2280,6 +2280,23 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
                                  'must all be of the same length as the '
                                  'calling Series/Index.')
 
+        # data has already been checked by _validate to be of correct dtype,
+        # but others could still have Series of dtypes (e.g. integers) which
+        # will necessarily fail in concatenation. To avoid deep and confusing
+        # traces, we raise here for anything that's not object or all-NA float.
+        def _legal_dtype(series):
+            # unify dtype handling between categorical/non-categorical
+            dtype = (series.dtype if not is_categorical_dtype(series)
+                     else series.cat.categories.dtype)
+            legal = dtype == 'O' or (dtype == 'float' and series.isna().all())
+            return legal
+        err_wrong_dtype = ('Can only concatenate list-likes containing only '
+                           'strings (or missing values).')
+        if any(not _legal_dtype(x) for x in others):
+            raise TypeError(err_wrong_dtype + ' Received list-like of dtype: '
+                            '{}'.format([x.dtype for x in others
+                                         if not _legal_dtype(x)][0]))
+
         if join is None and warn:
             warnings.warn("A future version of pandas will perform index "
                           "alignment when `others` is a Series/Index/"
@@ -2307,23 +2324,28 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
         na_masks = np.array([isna(x) for x in all_cols])
         union_mask = np.logical_or.reduce(na_masks, axis=0)
 
-        if na_rep is None and union_mask.any():
-            # no na_rep means NaNs for all rows where any column has a NaN
-            # only necessary if there are actually any NaNs
-            result = np.empty(len(data), dtype=object)
-            np.putmask(result, union_mask, np.nan)
-
-            not_masked = ~union_mask
-            result[not_masked] = cat_core([x[not_masked] for x in all_cols],
-                                          sep)
-        elif na_rep is not None and union_mask.any():
-            # fill NaNs with na_rep in case there are actually any NaNs
-            all_cols = [np.where(nm, na_rep, col)
-                        for nm, col in zip(na_masks, all_cols)]
-            result = cat_core(all_cols, sep)
-        else:
-            # no NaNs - can just concatenate
-            result = cat_core(all_cols, sep)
+        # if there are any non-string, non-null values hidden within an object
+        # dtype, cat_core will fail; catch error and return with better message
+        try:
+            if na_rep is None and union_mask.any():
+                # no na_rep means NaNs for all rows where any column has a NaN
+                # only necessary if there are actually any NaNs
+                result = np.empty(len(data), dtype=object)
+                np.putmask(result, union_mask, np.nan)
+
+                not_masked = ~union_mask
+                result[not_masked] = cat_core([x[not_masked]
+                                               for x in all_cols], sep)
+            elif na_rep is not None and union_mask.any():
+                # fill NaNs with na_rep in case there are actually any NaNs
+                all_cols = [np.where(nm, na_rep, col)
+                            for nm, col in zip(na_masks, all_cols)]
+                result = cat_core(all_cols, sep)
+            else:
+                # no NaNs - can just concatenate
+                result = cat_core(all_cols, sep)
+        except TypeError:
+            raise TypeError(err_wrong_dtype)
 
         if isinstance(self._orig, Index):
             # add dtype for case that result is all-NA
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 1ba0ef3918fb7..5011cead017c1 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -420,6 +420,22 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep):
             result = s.str.cat(t, sep=sep)
             assert_series_or_index_equal(result, expected)
 
+    # test integer/float dtypes (inferred by constructor) and mixed
+    @pytest.mark.parametrize('data', [[1, 2, 3], [.1, .2, .3], [1, 2, 'b']],
+                             ids=['integers', 'floats', 'mixed'])
+    # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
+    @pytest.mark.parametrize('box', [Series, Index, list,
+                                     lambda x: np.array(x, dtype=object)],
+                             ids=['Series', 'Index', 'list', 'np.array'])
+    def test_str_cat_wrong_dtype_raises(self, box, data):
+        # GH 22722
+        s = Series(['a', 'b', 'c'])
+        t = box(data)
+
+        msg = 'Can only concatenate list-likes containing only strings.*'
+        with pytest.raises(TypeError, match=msg):
+            s.str.cat(t, join='left')
+
     @pytest.mark.parametrize('box', [Series, Index])
     def test_str_cat_mixed_inputs(self, box):
         s = Index(['a', 'b', 'c', 'd'])

From fd710dea08469b672b7e751d8ec0ceb3a4f2c060 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Mon, 10 Jun 2019 16:17:48 +0200
Subject: [PATCH 2/4] Review (jreback)

---
 pandas/core/strings.py       | 77 ++++++++++++++++++------------------
 pandas/tests/test_strings.py |  5 ++-
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 218577016fd0e..d7f118f38cf0a 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -53,6 +53,27 @@ def cat_core(list_of_columns, sep):
     return np.sum(list_with_sep, axis=0)
 
 
+def cat_safe(list_of_columns, sep):
+    """
+    Auxiliary function for :meth:`str.cat`.
+
+    Same signature as cat_core, but handles TypeErrors in concatenation, which
+    happen if the Series in list_of columns have the wrong dtypes or content.
+    """
+    # if there are any non-string values (wrong dtype or hidden behind object
+    # dtype), np.sum will fail; catch error and return with better message
+    try:
+        result = cat_core(list_of_columns, sep)
+    except TypeError:
+        dtypes = [lib.infer_dtype(x, skipna=True) for x in list_of_columns]
+        illegal = [x not in ('string', 'empty') for x in dtypes]
+        first_offender = [x for x, y in zip(list_of_columns, illegal) if y][0]
+        raise TypeError('Concatenation requires list-likes containing only '
+                        'strings (or missing values). Offending values found '
+                        'in column {}'.format(first_offender)) from None
+    return result
+
+
 def _na_map(f, arr, na_result=np.nan, dtype=object):
     # should really _check_ for NA
     return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
@@ -2280,23 +2301,6 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
                                  'must all be of the same length as the '
                                  'calling Series/Index.')
 
-        # data has already been checked by _validate to be of correct dtype,
-        # but others could still have Series of dtypes (e.g. integers) which
-        # will necessarily fail in concatenation. To avoid deep and confusing
-        # traces, we raise here for anything that's not object or all-NA float.
-        def _legal_dtype(series):
-            # unify dtype handling between categorical/non-categorical
-            dtype = (series.dtype if not is_categorical_dtype(series)
-                     else series.cat.categories.dtype)
-            legal = dtype == 'O' or (dtype == 'float' and series.isna().all())
-            return legal
-        err_wrong_dtype = ('Can only concatenate list-likes containing only '
-                           'strings (or missing values).')
-        if any(not _legal_dtype(x) for x in others):
-            raise TypeError(err_wrong_dtype + ' Received list-like of dtype: '
-                            '{}'.format([x.dtype for x in others
-                                         if not _legal_dtype(x)][0]))
-
         if join is None and warn:
             warnings.warn("A future version of pandas will perform index "
                           "alignment when `others` is a Series/Index/"
@@ -2324,28 +2328,23 @@ def _legal_dtype(series):
         na_masks = np.array([isna(x) for x in all_cols])
         union_mask = np.logical_or.reduce(na_masks, axis=0)
 
-        # if there are any non-string, non-null values hidden within an object
-        # dtype, cat_core will fail; catch error and return with better message
-        try:
-            if na_rep is None and union_mask.any():
-                # no na_rep means NaNs for all rows where any column has a NaN
-                # only necessary if there are actually any NaNs
-                result = np.empty(len(data), dtype=object)
-                np.putmask(result, union_mask, np.nan)
-
-                not_masked = ~union_mask
-                result[not_masked] = cat_core([x[not_masked]
-                                               for x in all_cols], sep)
-            elif na_rep is not None and union_mask.any():
-                # fill NaNs with na_rep in case there are actually any NaNs
-                all_cols = [np.where(nm, na_rep, col)
-                            for nm, col in zip(na_masks, all_cols)]
-                result = cat_core(all_cols, sep)
-            else:
-                # no NaNs - can just concatenate
-                result = cat_core(all_cols, sep)
-        except TypeError:
-            raise TypeError(err_wrong_dtype)
+        if na_rep is None and union_mask.any():
+            # no na_rep means NaNs for all rows where any column has a NaN
+            # only necessary if there are actually any NaNs
+            result = np.empty(len(data), dtype=object)
+            np.putmask(result, union_mask, np.nan)
+
+            not_masked = ~union_mask
+            result[not_masked] = cat_safe([x[not_masked] for x in all_cols],
+                                          sep)
+        elif na_rep is not None and union_mask.any():
+            # fill NaNs with na_rep in case there are actually any NaNs
+            all_cols = [np.where(nm, na_rep, col)
+                        for nm, col in zip(na_masks, all_cols)]
+            result = cat_safe(all_cols, sep)
+        else:
+            # no NaNs - can just concatenate
+            result = cat_safe(all_cols, sep)
 
         if isinstance(self._orig, Index):
             # add dtype for case that result is all-NA
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 749ce1da1c1bb..955554f60aa1f 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -440,9 +440,10 @@ def test_str_cat_wrong_dtype_raises(self, box, data):
         s = Series(['a', 'b', 'c'])
         t = box(data)
 
-        msg = 'Can only concatenate list-likes containing only strings.*'
+        msg = 'Concatenation requires list-likes containing only strings.*'
         with pytest.raises(TypeError, match=msg):
-            s.str.cat(t, join='left')
+            # need to use outer and na_rep, as otherwise Index would not raise
+            s.str.cat(t, join='outer', na_rep='-')
 
     @pytest.mark.parametrize('box', [Series, Index])
     def test_str_cat_mixed_inputs(self, box):

From 02f6429662ea83b481a2156549bff482c3fcf24a Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Wed, 12 Jun 2019 22:54:30 +0200
Subject: [PATCH 3/4] Review (jreback & simonjayhawkins)

---
 doc/source/whatsnew/v0.25.0.rst |  2 +-
 pandas/core/strings.py          | 32 +++++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 7f0e50525e61b..2021a553dd69b 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -607,7 +607,7 @@ Strings
 ^^^^^^^
 
 - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
-- Improved error message when passing ``Series`` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`)
+- Improved error message when passing :class:`Series` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`)
 -
 
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index d7f118f38cf0a..4c767a6c4b2e4 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -58,19 +58,33 @@ def cat_safe(list_of_columns, sep):
     Auxiliary function for :meth:`str.cat`.
 
     Same signature as cat_core, but handles TypeErrors in concatenation, which
-    happen if the Series in list_of columns have the wrong dtypes or content.
+    happen if the arrays in list_of columns have the wrong dtypes or content.
+
+    Parameters
+    ----------
+    list_of_columns : list of numpy arrays
+        List of arrays to be concatenated with sep;
+        these arrays may not contain NaNs!
+    sep : string
+        The separator string for concatenating the columns
+
+    Returns
+    -------
+    nd.array
+        The concatenation of list_of_columns with sep
     """
-    # if there are any non-string values (wrong dtype or hidden behind object
-    # dtype), np.sum will fail; catch error and return with better message
     try:
         result = cat_core(list_of_columns, sep)
     except TypeError:
-        dtypes = [lib.infer_dtype(x, skipna=True) for x in list_of_columns]
-        illegal = [x not in ('string', 'empty') for x in dtypes]
-        first_offender = [x for x, y in zip(list_of_columns, illegal) if y][0]
-        raise TypeError('Concatenation requires list-likes containing only '
-                        'strings (or missing values). Offending values found '
-                        'in column {}'.format(first_offender)) from None
+        # if there are any non-string values (wrong dtype or hidden behind
+        # object dtype), np.sum will fail; catch and return with better message
+        for column in list_of_columns:
+            dtype = lib.infer_dtype(column, skipna=True)
+            if dtype not in ['string', 'empty']:
+                raise TypeError(
+                    'Concatenation requires list-likes containing only '
+                    'strings (or missing values). Offending values found in '
+                    'column {}'.format(dtype)) from None
     return result
 
 

From 9752aa7311ef7f9d80384d63d3a9b58ba00ab362 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Thu, 13 Jun 2019 22:00:59 +0200
Subject: [PATCH 4/4] Add typing to cat_core/cat_safe (review jreback)

---
 pandas/core/strings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index fcac17e8bfdbe..413c0e73f8410 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2,7 +2,7 @@
 from functools import wraps
 import re
 import textwrap
-from typing import Dict
+from typing import Dict, List
 import warnings
 
 import numpy as np
@@ -31,7 +31,7 @@
 _shared_docs = dict()  # type: Dict[str, str]
 
 
-def cat_core(list_of_columns, sep):
+def cat_core(list_of_columns: List, sep: str):
     """
     Auxiliary function for :meth:`str.cat`
 
@@ -53,7 +53,7 @@ def cat_core(list_of_columns, sep):
     return np.sum(list_with_sep, axis=0)
 
 
-def cat_safe(list_of_columns, sep):
+def cat_safe(list_of_columns: List, sep: str):
     """
     Auxiliary function for :meth:`str.cat`.