diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 33d1219b3b11f..78b0a54b8893f 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -960,6 +960,9 @@ importantly, these methods exclude missing/NA values automatically. These are accessed via the Series's ``str`` attribute and generally have names matching the equivalent (scalar) build-in string methods: +Splitting and Replacing Strings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. ipython:: python s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) @@ -990,11 +993,12 @@ Methods like ``replace`` and ``findall`` take regular expressions, too: s3 s3.str.replace('^.a|dog', 'XX-XX ', case=False) -The method ``match`` returns the groups in a regular expression in one tuple. -Starting in pandas version 0.13.0, the method ``extract`` is available to -accomplish this more conveniently. +Extracting Substrings +~~~~~~~~~~~~~~~~~~~~~ -Extracting a regular expression with one group returns a Series of strings. +The method ``extract`` (introduced in version 0.13) accepts regular expressions +with match groups. Extracting a regular expression with one group returns +a Series of strings. .. ipython:: python @@ -1016,18 +1020,34 @@ Named groups like .. ipython:: python - Series(['a1', 'b2', 'c3']).str.match('(?P<letter>[ab])(?P<digit>\d)') + Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)') and optional groups like .. ipython:: python - Series(['a1', 'b2', '3']).str.match('(?P<letter>[ab])?(?P<digit>\d)') + Series(['a1', 'b2', '3']).str.extract('(?P<letter>[ab])?(?P<digit>\d)') can also be used. -Methods like ``contains``, ``startswith``, and ``endswith`` takes an extra -``na`` arguement so missing values can be considered True or False: +Testing for Strings that Match or Contain a Pattern +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In previous versions, *extracting* match groups was accomplished by ``match``, +which returned a not-so-convenient Series of tuples. 
Starting in version 0.14, +the default behavior of ``match`` will change. It will return a boolean +indexer, analogous to the method ``contains``. + +The distinction between +``match`` and ``contains`` is strictness: ``match`` relies on +strict ``re.match`` while ``contains`` relies on ``re.search``. + +In version 0.13, ``match`` performs its old, deprecated behavior by default, +but the new behavior is available through the keyword argument +``as_indexer=True``. + +Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take +an extra ``na`` argument so missing values can be considered True or False: .. ipython:: python diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 6bf32b2343084..3ff0477678d79 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -102,6 +102,14 @@ Deprecated in 0.13.0 - deprecated ``iterkv``, which will be removed in a future release (this was an alias of iteritems used to bypass ``2to3``'s changes). (:issue:`4384`, :issue:`4375`, :issue:`4372`) +- deprecated the string method ``match``, whose role is now performed more + idiomatically by ``extract``. In a future release, the default behavior + of ``match`` will change to become analogous to ``contains``, which returns + a boolean indexer. (Their + distinction is strictness: ``match`` relies on ``re.match`` while + ``contains`` relies on ``re.search``.) In this release, the deprecated + behavior is the default, but the new behavior is available through the + keyword argument ``as_indexer=True``. 
Indexing API Changes ~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2c47911318238..e5e3f9866dc52 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -7,7 +7,7 @@ import pandas.compat as compat import re import pandas.lib as lib - +import warnings def _get_array_list(arr, others): if isinstance(others[0], (list, np.ndarray)): @@ -169,6 +169,10 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan): regex = re.compile(pat, flags=flags) + if regex.groups > 0: + warnings.warn("""This pattern has match groups. To actually get the +groups, use str.extract.""", UserWarning) + f = lambda x: bool(regex.search(x)) return _na_map(f, arr, na) @@ -303,35 +307,70 @@ def rep(x, r): return result -def str_match(arr, pat, flags=0): +def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): """ - Find groups in each string (from beginning) using passed regular expression + Deprecated: Find groups in each string using passed regular expression. + If as_indexer=True, determine if each string matches a regular expression. Parameters ---------- pat : string - Pattern or regular expression + Character sequence or regular expression + case : boolean, default True + If True, case sensitive flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE + na : default NaN, fill value for missing values. + as_indexer : False, by default, gives deprecated behavior better achieved + using str_extract. True return boolean indexer. + Returns ------- - matches : array + matches : boolean array (if as_indexer=True) + matches : array of tuples (if as_indexer=False, default but deprecated) + + Note + ---- + To extract matched groups, which is the deprecated behavior of match, use + str.extract. 
""" + + if not case: + flags |= re.IGNORECASE + regex = re.compile(pat, flags=flags) - def f(x): - m = regex.match(x) - if m: - return m.groups() - else: - return [] + if (not as_indexer) and regex.groups > 0: + # Do this first, to make sure it happens even if the re.compile + # raises below. + warnings.warn("""In future versions of pandas, match will change +to always return a bool indexer.""", UserWarning) + + if as_indexer and regex.groups > 0: + warnings.warn("""This pattern has match groups. To actually get the +groups, use str.extract.""", UserWarning) + + # If not as_indexer and regex.groups == 0, this returns empty lists + # and is basically useless, so we will not warn. + + if (not as_indexer) and regex.groups > 0: + def f(x): + m = regex.match(x) + if m: + return m.groups() + else: + return [] + else: + # This is the new behavior of str_match. + f = lambda x: bool(regex.match(x)) return _na_map(f, arr) + def str_extract(arr, pat, flags=0): """ - Find groups in each string (from beginning) using passed regular expression + Find groups in each string using passed regular expression Parameters ---------- @@ -358,7 +397,7 @@ def str_extract(arr, pat, flags=0): def f(x): if not isinstance(x, compat.string_types): return None - m = regex.match(x) + m = regex.search(x) if m: return m.groups()[0] # may be None else: @@ -368,7 +407,7 @@ def f(x): def f(x): if not isinstance(x, compat.string_types): return empty_row - m = regex.match(x) + m = regex.search(x) if m: return Series(list(m.groups())) # may contain None else: @@ -668,13 +707,13 @@ def wrapper(self): return wrapper -def _pat_wrapper(f, flags=False, na=False): +def _pat_wrapper(f, flags=False, na=False, **kwargs): def wrapper1(self, pat): result = f(self.series, pat) return self._wrap_result(result) - def wrapper2(self, pat, flags=0): - result = f(self.series, pat, flags=flags) + def wrapper2(self, pat, flags=0, **kwargs): + result = f(self.series, pat, flags=flags, **kwargs) return 
self._wrap_result(result) def wrapper3(self, pat, na=np.nan): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1be9013ce7575..29bdffd86a2c7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -5,6 +5,7 @@ import operator import re import unittest +import warnings import nose @@ -392,10 +393,14 @@ def test_repeat(self): u('dddddd')]) tm.assert_series_equal(result, exp) - def test_match(self): + def test_deprecated_match(self): + # Old match behavior, deprecated (but still default) in 0.13 values = Series(['fooBAD__barBAD', NA, 'foo']) - result = values.str.match('.*(BAD[_]+).*(BAD)') + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = values.str.match('.*(BAD[_]+).*(BAD)') + assert issubclass(w[-1].category, UserWarning) exp = Series([('BAD__', 'BAD'), NA, []]) tm.assert_series_equal(result, exp) @@ -403,7 +408,10 @@ def test_match(self): mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), 'foo', None, 1, 2.]) - rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') + assert issubclass(w[-1].category, UserWarning) xp = [('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), NA, NA, [], NA, NA, NA] tm.assert_isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -411,10 +419,52 @@ def test_match(self): # unicode values = Series([u('fooBAD__barBAD'), NA, u('foo')]) - result = values.str.match('.*(BAD[_]+).*(BAD)') + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = values.str.match('.*(BAD[_]+).*(BAD)') + assert issubclass(w[-1].category, UserWarning) exp = Series([(u('BAD__'), u('BAD')), NA, []]) tm.assert_series_equal(result, exp) + def test_match(self): + # New match behavior introduced in 0.13 + values = Series(['fooBAD__barBAD', NA, 'foo']) + with warnings.catch_warnings(record=True) as w: 
+ warnings.simplefilter('always') + result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + assert issubclass(w[-1].category, UserWarning) + exp = Series([True, NA, False]) + tm.assert_series_equal(result, exp) + + # If no groups, use new behavior even when as_indexer is False. + # (Old behavior is pretty much useless in this case.) + values = Series(['fooBAD__barBAD', NA, 'foo']) + result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False) + exp = Series([True, NA, False]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), + 'foo', None, 1, 2.]) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + assert issubclass(w[-1].category, UserWarning) + xp = [True, NA, True, NA, NA, False, NA, NA, NA] + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('fooBAD__barBAD'), NA, u('foo')]) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + assert issubclass(w[-1].category, UserWarning) + exp = Series([True, NA, False]) + tm.assert_series_equal(result, exp) + def test_extract(self): # Contains tests like those in test_match and some others. 
@@ -966,7 +1016,10 @@ def test_match_findall_flags(self): pat = pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' - result = data.str.match(pat, flags=re.IGNORECASE) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = data.str.match(pat, flags=re.IGNORECASE) + assert issubclass(w[-1].category, UserWarning) self.assertEquals(result[0], ('dave', 'google', 'com')) result = data.str.findall(pat, flags=re.IGNORECASE) @@ -975,7 +1028,10 @@ def test_match_findall_flags(self): result = data.str.count(pat, flags=re.IGNORECASE) self.assertEquals(result[0], 1) - result = data.str.contains(pat, flags=re.IGNORECASE) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = data.str.contains(pat, flags=re.IGNORECASE) + assert issubclass(w[-1].category, UserWarning) self.assertEquals(result[0], True) def test_encode_decode(self):