diff --git a/pandas/core/series.py b/pandas/core/series.py index d17b29a0d7349..c0a013e387a04 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,7 @@ import pandas.core.generic as generic import pandas.core.nanops as nanops import pandas.lib as lib -from pandas.util.decorators import Appender, Substitution +from pandas.util.decorators import Appender, Substitution, cache_readonly from pandas.compat.scipy import scoreatpercentile as _quantile @@ -2626,6 +2626,10 @@ def tz_localize(self, tz, copy=True): return Series(new_values, index=new_index, name=self.name) + @cache_readonly + def str(self): + from pandas.core.strings import StringMethods + return StringMethods(self) _INDEX_TYPES = ndarray, Index, list, tuple diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 14939e3eb4476..3734a48ab53e2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,11 +1,92 @@ import numpy as np -from pandas.util.map import mapwrap, auto_map + +from functools import wraps +from itertools import izip +from pandas.core.common import isnull +from pandas.core.series import Series import re +import pandas.lib as lib +import pandas.core.common as com +import operator + +class repeat(object): + def __init__(self, obj): + self.obj = obj + + def __getitem__(self, i): + return self.obj + +class azip(object): + def __init__(self, *args): + self.cols = [] + for a in args: + if np.isscalar(a): + self.cols.append(repeat(a)) + else: + self.cols.append(a) + + def __getitem__(self, i): + return [col[i] for col in self.cols] + + +def map_iter_args(arr, f, otherargs, n_otherargs, required, n_results): + ''' + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + ''' + notnull = com.notnull + + n = len(arr) + result = np.empty((n, n_results), dtype=object) + for i, val in enumerate(arr): + args = otherargs[i] + if notnull(val) and 
all(notnull(args[r]) for r in required): + result[i] = f(val, *args) + else: + result[i] = [np.nan] * n_results -startswith = mapwrap(str.startswith) -contains = mapwrap(str.__contains__) -upper = mapwrap(str.upper) -lower = mapwrap(str.lower) + return [lib.maybe_convert_objects(col, try_float=0) for col in result.T] + + +def auto_map(arr, f, otherargs, n_results=1, required='all'): + from pandas.core.series import Series + + if all(np.isscalar(a) for a in otherargs): + res = lib.map_infer(arr, lambda v: f(v, *otherargs)) + return Series(res, index=arr.index, copy=False) + + n_otherargs = len(otherargs) + if required == 'all': + required = list(range(n_otherargs)) + res = map_iter_args(arr, f, azip(*otherargs), n_otherargs, + required, n_results) + res = [Series(col, index=arr.index, copy=False) for col in res] + if n_results == 1: + return res[0] + return res + + +def mapwrap(f, n_results_default=1, required='all'): + # @wraps(f) + + def wrapped(arr, n_results=None, *otherargs): + n_results = n_results or n_results_default + return auto_map(arr, f, otherargs, n_results, required) + + return wrapped + +startswith = mapwrap(lambda x, p: x.startswith(p)) +contains = mapwrap(lambda x, p: x.__contains__(p)) +upper = mapwrap(lambda x: x.upper()) +lower = mapwrap(lambda x: x.lower()) def _re_get_groups(pattern, n): def inner(s, *groups): @@ -13,13 +94,13 @@ def inner(s, *groups): if m: return m.group(*[int(g) for g in groups]) return np.nan if n == 1 else [np.nan] * n - + return inner def search_re(arr, pattern, groups=(0,)): if isinstance(pattern, str): pattern = re.compile(pattern) - + if isinstance(groups, np.ndarray): if groups.ndim == 1: n_groups = 1 @@ -27,5 +108,614 @@ def search_re(arr, pattern, groups=(0,)): n_groups = groups.shape[1] else: n_groups = len(groups) - - return auto_map(arr, _re_get_groups(pattern, n_groups), (groups,), n_results=n_groups) + + return auto_map(arr, _re_get_groups(pattern, n_groups), + (groups,), n_results=n_groups) + + +def 
_get_array_list(arr, others): + if isinstance(others[0], (list, np.ndarray)): + arrays = [arr] + list(others) + else: + arrays = [arr, others] + + return [np.asarray(x, dtype=object) for x in arrays] + + +def str_cat(arr, others=None, sep=None, na_rep=None): + """ + Concatenate arrays of strings with given separator + + Parameters + ---------- + arr : list or array-like + others : list or array, or list of arrays + sep : string or None, default None + na_rep : string or None, default None + If None, an NA in any array will propagate + + Returns + ------- + concat : array + """ + if sep is None: + sep = '' + + if others is not None: + arrays = _get_array_list(arr, others) + + n = _length_check(arrays) + masks = np.array([isnull(x) for x in arrays]) + cats = None + + if na_rep is None: + na_mask = np.logical_or.reduce(masks, axis=0) + + result = np.empty(n, dtype=object) + np.putmask(result, na_mask, np.nan) + + notmask = -na_mask + + if sep is None: + for x in arrays: + x = x[notmask] + if cats is None: + cats = x + else: + cats = cats + x[notmask] + else: + tuples = izip(*[x[notmask] for x in arrays]) + cats = [sep.join(tup) for tup in tuples] + + result[notmask] = cats + else: + for i, x in enumerate(arrays): + x = np.where(masks[i], na_rep, x) + if cats is None: + cats = x + else: + cats = cats + sep + x + + result = cats + + return result + else: + arr = np.asarray(arr, dtype=object) + mask = isnull(arr) + if na_rep is None and mask.any(): + return np.nan + return sep.join(np.where(mask, na_rep, arr)) + + +def _length_check(others): + n = None + for x in others: + if n is None: + n = len(x) + elif len(x) != n: + raise ValueError('All arrays must be same length') + + return n + + +def _na_map(f, arr, na_result=np.nan): + # should really _check_ for NA + def g(x): + try: + return f(x) + except (TypeError, AttributeError): + return na_result + return _map(g, arr) + + +def _map(f, arr): + if not isinstance(arr, np.ndarray): + arr = np.asarray(arr, dtype=object) + 
return lib.map_infer(arr, f) + + +def str_count(arr, pat): + """ + Count occurrences of pattern in each string + + Parameters + ---------- + arr : list or array-like + pat : string, valid regular expression + + Returns + ------- + counts : array + """ + regex = re.compile(pat) + f = lambda x: len(regex.findall(x)) + return _na_map(f, arr) + + +def str_contains(arr, pat, case=True): + """ + Check whether given pattern is contained in each string in the array + + Parameters + ---------- + pat : string + Character sequence or regular expression + case : boolean, default True + If True, case sensitive + + Returns + ------- + + """ + if not case: + regex = re.compile(pat, re.IGNORECASE) + else: + regex = re.compile(pat) + f = lambda x: bool(regex.search(x)) + return _na_map(f, arr) + + +def str_startswith(arr, pat): + """ + Return boolean array indicating whether each string starts with passed + pattern + + Parameters + ---------- + pat : string + Character sequence + + Returns + ------- + startswith : array (boolean) + """ + f = lambda x: x.startswith(pat) + return _na_map(f, arr) + + +def str_endswith(arr, pat): + """ + Return boolean array indicating whether each string ends with passed + pattern + + Parameters + ---------- + pat : string + Character sequence + + Returns + ------- + endswith : array (boolean) + """ + f = lambda x: x.endswith(pat) + return _na_map(f, arr) + + +def str_lower(arr): + """ + Convert strings in array to lowercase + + Returns + ------- + lowercase : array + """ + return _na_map(lambda x: x.lower(), arr) + + +def str_upper(arr): + """ + Convert strings in array to uppercase + + Returns + ------- + uppercase : array + """ + return _na_map(lambda x: x.upper(), arr) + + +def str_replace(arr, pat, repl, n=0, case=True): + """ + Replace occurrences of pattern/regex in each string with repl + + Parameters + ---------- + pat : string + Character sequence or regular expression + repl : string + Replacement sequence + n : int, default 0 (all) + Number of replacements to make from start + case : 
boolean, default True + If True, case sensitive + + Returns + ------- + replaced : array + """ + if not case: + regex = re.compile(pat, re.IGNORECASE) + else: + regex = re.compile(pat) + def f(x): + return regex.sub(repl, x, count=n) + + return _na_map(f, arr) + +def str_repeat(arr, repeats): + """ + Duplicate each string in the array by indicated number of times + + Parameters + ---------- + repeats : int or array + Same value for all (int) or different value per (array) + + Returns + ------- + repeated : array + """ + if np.isscalar(repeats): + def rep(x): + try: + return str.__mul__(x, repeats) + except TypeError: + return unicode.__mul__(x, repeats) + return _na_map(rep, arr) + else: + def rep(x, r): + try: + return str.__mul__(x, r) + except TypeError: + return unicode.__mul__(x, r) + repeats = np.asarray(repeats, dtype=object) + result = lib.vec_binop(arr, repeats, rep) + return result + +def str_match(arr, pat): + """ + Find groups in each string (from beginning) using passed regular expression + + Parameters + ---------- + pat : string + Pattern or regular expression + + Returns + ------- + matches : array + """ + regex = re.compile(pat) + def f(x): + m = regex.match(x) + if m: + return m.groups() + else: + return [] + + return _na_map(f, arr) + + + +def str_join(arr, sep): + """ + Join lists contained as elements in array, a la str.join + + Parameters + ---------- + sep : string + Delimiter + + Returns + ------- + joined : array + """ + return _na_map(sep.join, arr) + + +def str_len(arr): + """ + Compute length of each string in array. 
+ + Returns + ------- + lengths : array + """ + return _na_map(len, arr) + + + +def str_findall(arr, pat): + """ + Find all occurrences of pattern or regular expression + + Parameters + ---------- + pat : string + Pattern or regular expression + + Returns + ------- + matches : array + """ + regex = re.compile(pat) + return _na_map(regex.findall, arr) + + +def str_pad(arr, width, side='left'): + """ + Pad strings with whitespace + + Parameters + ---------- + arr : list or array-like + width : int + Minimum width of resulting string; additional characters will be filled + with spaces + side : {'left', 'right', 'both'}, default 'left' + + Returns + ------- + padded : array + """ + if side == 'left': + f = lambda x: x.rjust(width) + elif side == 'right': + f = lambda x: x.ljust(width) + elif side == 'both': + f = lambda x: x.center(width) + else: # pragma: no cover + raise ValueError('Invalid side') + + return _na_map(f, arr) + + +def str_center(arr, width): + """ + "Center" strings, filling left and right side with additional whitespace + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with spaces + + Returns + ------- + centered : array + """ + return str_pad(arr, width, side='both') + + +def str_split(arr, pat, n=0): + """ + Split each string (a la re.split) in array by given pattern, propagating NA + values + + Parameters + ---------- + pat : string + String or regular expression to split on + n : int, default 0 (all) + + Returns + ------- + split : array + """ + regex = re.compile(pat) + f = lambda x: regex.split(x, maxsplit=n) + return _na_map(f, arr) + + +def str_slice(arr, start=None, stop=None): + """ + Slice substrings from each element in array + + Parameters + ---------- + start : int or None + stop : int or None + + Returns + ------- + sliced : array + """ + obj = slice(start, stop) + f = lambda x: x[obj] + return _na_map(f, arr) + + +def str_slice_replace(arr, start=None, stop=None, 
repl=None): + """ + + Parameters + ---------- + + Returns + ------- + replaced : array + """ + raise NotImplementedError + + +def str_strip(arr): + """ + Strip whitespace (including newlines) from each string in the array + + Returns + ------- + stripped : array + """ + return _na_map(lambda x: x.strip(), arr) + + +def str_lstrip(arr): + """ + Strip whitespace (including newlines) from left side of each string in the + array + + Returns + ------- + stripped : array + """ + return _na_map(lambda x: x.lstrip(), arr) + + +def str_rstrip(arr): + """ + Strip whitespace (including newlines) from right side of each string in the + array + + Returns + ------- + stripped : array + """ + return _na_map(lambda x: x.rstrip(), arr) + + +def str_wrap(arr, width=80): + """ + Wrap long strings to be formatted in paragraphs + + Parameters + ---------- + width : int + Maximum line-width + + Returns + ------- + wrapped : array + """ + raise NotImplementedError + +def str_get(arr, i): + """ + Extract element from lists, tuples, or strings in each element in the array + + Parameters + ---------- + i : int + Integer index (location) + + Returns + ------- + items : array + """ + f = lambda x: x[i] + return _na_map(f, arr) + +def _noarg_wrapper(f): + def wrapper(self): + result = f(self.series) + return self._wrap_result(result) + + wrapper.__name__ = f.__name__ + if f.__doc__: + wrapper.__doc__ = f.__doc__ + + return wrapper + + +def _pat_wrapper(f): + def wrapper(self, pat): + result = f(self.series, pat) + return self._wrap_result(result) + + wrapper.__name__ = f.__name__ + if f.__doc__: + wrapper.__doc__ = f.__doc__ + + return wrapper + +def copy(source): + "Copy a docstring from another source function (if present)" + def do_copy(target): + if source.__doc__: + target.__doc__ = source.__doc__ + return target + return do_copy + + +class StringMethods(object): + """ + Vectorized string functions for Series. NAs stay NA unless handled + otherwise by a particular method. 
Patterned after Python's string methods, + with some inspiration from R's stringr package. + + Examples + -------- + >>> s.str.split('_') + >>> s.str.replace('_', '') + """ + def __init__(self, series): + self.series = series + + def _wrap_result(self, result): + return Series(result, index=self.series.index, + name=self.series.name) + + @copy(str_cat) + def cat(self, others=None, sep=None, na_rep=None): + result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep) + return self._wrap_result(result) + + @copy(str_split) + def split(self, pat, n=0): + result = str_split(self.series, pat, n=n) + return self._wrap_result(result) + + @copy(str_get) + def get(self, i): + result = str_get(self.series, i) + return self._wrap_result(result) + + @copy(str_join) + def join(self, sep): + result = str_join(self.series, sep) + return self._wrap_result(result) + + @copy(str_contains) + def contains(self, pat, case=True): + result = str_contains(self.series, pat, case=case) + return self._wrap_result(result) + + @copy(str_replace) + def replace(self, pat, repl, n=0, case=True): + result = str_replace(self.series, pat, repl, n=n, case=case) + return self._wrap_result(result) + + @copy(str_repeat) + def repeat(self, repeats): + result = str_repeat(self.series, repeats) + return self._wrap_result(result) + + @copy(str_pad) + def pad(self, width, side='left'): + result = str_pad(self.series, width, side=side) + return self._wrap_result(result) + + @copy(str_center) + def center(self, width): + result = str_center(self.series, width) + return self._wrap_result(result) + + @copy(str_slice) + def slice(self, start=None, stop=None): + result = str_slice(self.series, start, stop) + return self._wrap_result(result) + + @copy(str_slice) + def slice_replace(self, i=None, j=None): + raise NotImplementedError + + count = _pat_wrapper(str_count) + startswith = _pat_wrapper(str_startswith) + endswith = _pat_wrapper(str_endswith) + findall = _pat_wrapper(str_findall) + match = 
_pat_wrapper(str_match) + + len = _noarg_wrapper(str_len) + strip = _noarg_wrapper(str_strip) + rstrip = _noarg_wrapper(str_rstrip) + lstrip = _noarg_wrapper(str_lstrip) + lower = _noarg_wrapper(str_lower) + upper = _noarg_wrapper(str_upper) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py new file mode 100644 index 0000000000000..7512330b6ae15 --- /dev/null +++ b/pandas/tests/test_strings.py @@ -0,0 +1,631 @@ +# pylint: disable-msg=E1101,W0612 + +from datetime import datetime, timedelta, date +import os +import operator +import unittest + +import nose + +from numpy import nan as NA +import numpy as np + +from pandas import (Index, Series, TimeSeries, DataFrame, isnull, notnull, + bdate_range, date_range) +import pandas.core.common as com + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +import pandas.core.strings as strings + +class TestStringMethods(unittest.TestCase): + + def test_cat(self): + one = ['a', 'a', 'b', 'b', 'c', NA] + two = ['a', NA, 'b', 'd', 'foo', NA] + + # single array + result = strings.str_cat(one) + self.assert_(isnull(result)) + + result = strings.str_cat(one, na_rep='NA') + exp = 'aabbcNA' + self.assertEquals(result, exp) + + result = strings.str_cat(one, na_rep='-') + exp = 'aabbc-' + self.assertEquals(result, exp) + + result = strings.str_cat(one, sep='_', na_rep='NA') + exp = 'a_a_b_b_c_NA' + self.assertEquals(result, exp) + + # Multiple arrays + result = strings.str_cat(one, [two], na_rep='NA') + exp = ['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'] + self.assert_(np.array_equal(result, exp)) + + result = strings.str_cat(one, two) + exp = ['aa', NA, 'bb', 'bd', 'cfoo', NA] + tm.assert_almost_equal(result, exp) + + def test_count(self): + values = ['foo', 'foofoo', NA, 'foooofooofommmfoo'] + + result = strings.str_count(values, 'f[o]+') + exp = [1, 2, NA, 4] + tm.assert_almost_equal(result, exp) + + result = Series(values).str.count('f[o]+') + 
self.assert_(isinstance(result, Series)) + tm.assert_almost_equal(result, exp) + + #mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] + rs = strings.str_count(mixed, 'a') + xp = [1, NA, 0, NA, NA, 0, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.count('a') + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = [u'foo', u'foofoo', NA, u'foooofooofommmfoo'] + + result = strings.str_count(values, 'f[o]+') + exp = [1, 2, NA, 4] + tm.assert_almost_equal(result, exp) + + result = Series(values).str.count('f[o]+') + self.assert_(isinstance(result, Series)) + tm.assert_almost_equal(result, exp) + + def test_contains(self): + values = ['foo', NA, 'fooommm__foo', 'mmm_'] + pat = 'mmm[_]+' + + result = strings.str_contains(values, pat) + expected = [False, np.nan, True, True] + tm.assert_almost_equal(result, expected) + + values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + result = strings.str_contains(values, pat) + expected = [False, False, True, True] + self.assert_(result.dtype == np.bool_) + tm.assert_almost_equal(result, expected) + + #mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
+ rs = strings.str_contains(mixed, 'o') + xp = [False, NA, False, NA, NA, True, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.contains('o') + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = [u'foo', NA, u'fooommm__foo', u'mmm_'] + pat = 'mmm[_]+' + + result = strings.str_contains(values, pat) + expected = [False, np.nan, True, True] + tm.assert_almost_equal(result, expected) + + values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + result = strings.str_contains(values, pat) + expected = [False, False, True, True] + self.assert_(result.dtype == np.bool_) + tm.assert_almost_equal(result, expected) + + def test_startswith(self): + values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + + result = values.str.startswith('foo') + exp = Series([False, NA, True, False, False, NA, True]) + tm.assert_series_equal(result, exp) + + #mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] + rs = strings.str_startswith(mixed, 'f') + xp = [False, NA, False, NA, NA, True, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.startswith('f') + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'om', NA, u'foo_nom', u'nom', u'bar_foo', NA, + u'foo']) + + result = values.str.startswith('foo') + exp = Series([False, NA, True, False, False, NA, True]) + tm.assert_series_equal(result, exp) + + def test_endswith(self): + values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + + result = values.str.endswith('foo') + exp = Series([False, NA, False, False, True, NA, True]) + tm.assert_series_equal(result, exp) + + #mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
+ rs = strings.str_endswith(mixed, 'f') + xp = [False, NA, False, NA, NA, False, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.endswith('f') + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'om', NA, u'foo_nom', u'nom', u'bar_foo', NA, + u'foo']) + + result = values.str.endswith('foo') + exp = Series([False, NA, False, False, True, NA, True]) + tm.assert_series_equal(result, exp) + + def test_lower_upper(self): + values = Series(['om', NA, 'nom', 'nom']) + + result = values.str.upper() + exp = Series(['OM', NA, 'NOM', 'NOM']) + tm.assert_series_equal(result, exp) + + result = result.str.lower() + tm.assert_series_equal(result, values) + + #mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, + 1, 2.]) + mixed = mixed.str.upper() + rs = Series(mixed).str.lower() + xp = ['a', NA, 'b', NA, NA, 'foo', NA, NA, NA] + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'om', NA, u'nom', u'nom']) + + result = values.str.upper() + exp = Series([u'OM', NA, u'NOM', u'NOM']) + tm.assert_series_equal(result, exp) + + result = result.str.lower() + tm.assert_series_equal(result, values) + + def test_replace(self): + values = Series(['fooBAD__barBAD', NA]) + + result = values.str.replace('BAD[_]*', '') + exp = Series(['foobar', NA]) + tm.assert_series_equal(result, exp) + + result = values.str.replace('BAD[_]*', '', n=1) + exp = Series(['foobarBAD', NA]) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', + None, 1, 2.]) + + rs = Series(mixed).str.replace('BAD[_]*', '') + xp = ['a', NA, 'b', NA, NA, 'foo', NA, NA, NA] + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'fooBAD__barBAD', NA]) + + result = values.str.replace('BAD[_]*', '') + exp = Series([u'foobar', NA]) + tm.assert_series_equal(result, exp) + 
+ result = values.str.replace('BAD[_]*', '', n=1) + exp = Series([u'foobarBAD', NA]) + tm.assert_series_equal(result, exp) + + def test_repeat(self): + values = Series(['a', 'b', NA, 'c', NA, 'd']) + + result = values.str.repeat(3) + exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd']) + tm.assert_series_equal(result, exp) + + result = values.str.repeat([1, 2, 3, 4, 5, 6]) + exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd']) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', + None, 1, 2.]) + + rs = Series(mixed).str.repeat(3) + xp = ['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA] + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'a', u'b', NA, u'c', NA, u'd']) + + result = values.str.repeat(3) + exp = Series([u'aaa', u'bbb', NA, u'ccc', NA, u'ddd']) + tm.assert_series_equal(result, exp) + + result = values.str.repeat([1, 2, 3, 4, 5, 6]) + exp = Series([u'a', u'bb', NA, u'cccc', NA, u'dddddd']) + tm.assert_series_equal(result, exp) + + + def test_match(self): + values = Series(['fooBAD__barBAD', NA, 'foo']) + + result = values.str.match('.*(BAD[_]+).*(BAD)') + exp = Series([('BAD__', 'BAD'), NA, []]) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), + 'foo', None, 1, 2.]) + + rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') + xp = [('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), NA, NA, [], NA, NA, NA] + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'fooBAD__barBAD', NA, u'foo']) + + result = values.str.match('.*(BAD[_]+).*(BAD)') + exp = Series([(u'BAD__', u'BAD'), NA, []]) + tm.assert_series_equal(result, exp) + + def test_join(self): + values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + result = values.str.split('_').str.join('_') + tm.assert_series_equal(values, result) + + #mixed + mixed = Series(['a_b', NA, 
'asdf_cas_asdf', True, datetime.today(), + 'foo', None, 1, 2.]) + + rs = Series(mixed).str.split('_').str.join('_') + xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'a_b_c', u'c_d_e', np.nan, u'f_g_h']) + result = values.str.split('_').str.join('_') + tm.assert_series_equal(values, result) + + def test_len(self): + values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) + + result = values.str.len() + exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), + 'foo', None, 1, 2.]) + + rs = Series(mixed).str.len() + xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'foo', u'fooo', u'fooooo', np.nan, u'fooooooo']) + + result = values.str.len() + exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + tm.assert_series_equal(result, exp) + + def test_findall(self): + values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD']) + + result = values.str.findall('BAD[_]*') + exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']]) + tm.assert_almost_equal(result, exp) + + #mixed + mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(), + 'BAD', None, 1, 2.]) + + rs = Series(mixed).str.findall('BAD[_]*') + xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'fooBAD__barBAD', NA, u'foo', u'BAD']) + + result = values.str.findall('BAD[_]*') + exp = Series([[u'BAD__', u'BAD'], NA, [], [u'BAD']]) + tm.assert_almost_equal(result, exp) + + def test_pad(self): + values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + + result = values.str.pad(5, side='left') + exp = Series([' a', ' b', NA, ' c', NA, 
'eeeeee']) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='right') + exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='both') + exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + #mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'ee', None, 1, 2.]) + + rs = Series(mixed).str.pad(5, side='left') + xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'ee', None, 1, 2.]) + + rs = Series(mixed).str.pad(5, side='right') + xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'ee', None, 1, 2.]) + + rs = Series(mixed).str.pad(5, side='both') + xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'a', u'b', NA, u'c', NA, u'eeeeee']) + + result = values.str.pad(5, side='left') + exp = Series([u' a', u' b', NA, u' c', NA, u'eeeeee']) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='right') + exp = Series([u'a ', u'b ', NA, u'c ', NA, u'eeeeee']) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='both') + exp = Series([u' a ', u' b ', NA, u' c ', NA, u'eeeeee']) + tm.assert_almost_equal(result, exp) + + def test_center(self): + values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + + result = values.str.center(5) + exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + #mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'c', 'eee', None, 1, 2.]) + + rs = Series(mixed).str.center(5) + xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' 
eee ', NA, NA, + NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'a', u'b', NA, u'c', NA, u'eeeeee']) + + result = values.str.center(5) + exp = Series([u' a ', u' b ', NA, u' c ', NA, u'eeeeee']) + tm.assert_almost_equal(result, exp) + + def test_split(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + + result = values.str.split('_') + exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.split('_') + xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, + NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h']) + + result = values.str.split('_') + exp = Series([[u'a', u'b', u'c'], [u'c', u'd', u'e'], NA, + [u'f', u'g', u'h']]) + tm.assert_series_equal(result, exp) + + def test_slice(self): + values = Series(['aafootwo','aabartwo', NA, 'aabazqux']) + + result = values.str.slice(2, 5) + exp = Series(['foo', 'bar', NA, 'baz']) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.slice(2, 5) + xp = Series(['foo', NA, 'bar', NA, NA, + NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'aafootwo', u'aabartwo', NA, u'aabazqux']) + + result = values.str.slice(2, 5) + exp = Series([u'foo', u'bar', NA, u'baz']) + tm.assert_series_equal(result, exp) + + def test_slice_replace(self): + pass + + def test_strip_lstrip_rstrip(self): + values = Series([' aa ', ' bb \n', NA, 'cc ']) + + result = values.str.strip() + exp = Series(['aa', 'bb', NA, 'cc']) + tm.assert_series_equal(result, exp) + + result = values.str.lstrip() + exp = Series(['aa ', 'bb \n', NA, 'cc ']) 
+ tm.assert_series_equal(result, exp) + + result = values.str.rstrip() + exp = Series([' aa', ' bb', NA, 'cc']) + tm.assert_series_equal(result, exp) + + #mixed + mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.strip() + xp = Series(['aa', NA, 'bb', NA, NA, + NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.lstrip() + xp = Series(['aa ', NA, 'bb \t\n', NA, NA, + NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.rstrip() + xp = Series([' aa', NA, ' bb', NA, NA, + NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u' aa ', u' bb \n', NA, u'cc ']) + + result = values.str.strip() + exp = Series([u'aa', u'bb', NA, u'cc']) + tm.assert_series_equal(result, exp) + + result = values.str.lstrip() + exp = Series([u'aa ', u'bb \n', NA, u'cc ']) + tm.assert_series_equal(result, exp) + + result = values.str.rstrip() + exp = Series([u' aa', u' bb', NA, u'cc']) + tm.assert_series_equal(result, exp) + + def test_wrap(self): + pass + + def test_get(self): + values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + + result = values.str.split('_').str.get(1) + expected = Series(['b', 'd', np.nan, 'g']) + tm.assert_series_equal(result, expected) + + #mixed + mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.split('_').str.get(1) + xp = Series(['b', NA, 'd', NA, NA, + NA, NA, NA]) + + self.assert_(isinstance(rs, Series)) + tm.assert_almost_equal(rs, xp) + + #unicode + values = Series([u'a_b_c', u'c_d_e', np.nan, u'f_g_h']) + + result = values.str.split('_').str.get(1) + expected = Series([u'b', u'd', np.nan, u'g']) + tm.assert_series_equal(result, expected) + + def test_more_contains(self): + # PR #1179 + import re + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, + 'CABA', 
'dog', 'cat']) + + result = s.str.contains('a') + expected = Series([False, False, False, True, True, False, np.nan, + False, False, True]) + assert_series_equal(result, expected) + + result = s.str.contains('a', case=False) + expected = Series([True, False, False, True, True, False, np.nan, + True, False, True]) + assert_series_equal(result, expected) + + result = s.str.contains('Aa') + expected = Series([False, False, False, True, False, False, np.nan, + False, False, False]) + assert_series_equal(result, expected) + + result = s.str.contains('ba') + expected = Series([False, False, False, True, False, False, np.nan, + False, False, False]) + assert_series_equal(result, expected) + + result = s.str.contains('ba', case=False) + expected = Series([False, False, False, True, True, False, np.nan, + True, False, False]) + assert_series_equal(result, expected) + + def test_more_replace(self): + # PR #1179 + import re + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', + '', NA, 'CABA', 'dog', 'cat']) + + result = s.str.replace('A', 'YYY') + expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA, + 'CYYYBYYY', 'dog', 'cat']) + assert_series_equal(result, expected) + + result = s.str.replace('A', 'YYY', case=False) + expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA, + 'CYYYBYYY', 'dog', 'cYYYt']) + assert_series_equal(result, expected) + + result = s.str.replace('^.a|dog', 'XX-XX ', case=False) + expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA, + 'XX-XX BA', 'XX-XX ', 'XX-XX t']) + assert_series_equal(result, expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/util/map.py b/pandas/util/map.py deleted file mode 100644 index 65ab1e974a930..0000000000000 --- a/pandas/util/map.py +++ /dev/null @@ -1,69 +0,0 @@ -import numpy as np -from pandas import _tseries as lib -from pandas import notnull, Series -from functools import wraps - -class 
repeat(object): - def __init__(self, obj): - self.obj = obj - - def __getitem__(self, i): - return self.obj - -class azip(object): - def __init__(self, *args): - self.cols = [] - for a in args: - if np.isscalar(a): - self.cols.append(repeat(a)) - else: - self.cols.append(a) - - def __getitem__(self, i): - return [col[i] for col in self.cols] - -def map_iter_args(arr, f, otherargs, n_otherargs, required, n_results): - ''' - Substitute for np.vectorize with pandas-friendly dtype inference - - Parameters - ---------- - arr : ndarray - f : function - - Returns - ------- - mapped : ndarray - ''' - n = len(arr) - result = np.empty((n, n_results), dtype=object) - for i, val in enumerate(arr): - args = otherargs[i] - if notnull(val) and all(notnull(args[r]) for r in required): - result[i] = f(val, *args) - else: - result[i] = [np.nan] * n_results - - return [lib.maybe_convert_objects(col, try_float=0) for col in result.T] - -def auto_map(arr, f, otherargs, n_results=1, required='all'): - if all(np.isscalar(a) for a in otherargs): - res = lib.map_infer(arr, lambda v: f(v, *otherargs)) - return Series(res, index=arr.index, copy=False) - - n_otherargs = len(otherargs) - if required == 'all': - required = list(range(n_otherargs)) - res = map_iter_args(arr, f, azip(*otherargs), n_otherargs, required, n_results) - res = [Series(col, index=arr.index, copy=False) for col in res] - if n_results == 1: - return res[0] - return res - -def mapwrap(f, n_results_default=1, required='all'): - @wraps(f) - def wrapped(arr, n_results=None, *otherargs): - n_results = n_results or n_results_default - return auto_map(arr, f, otherargs, n_results, required) - - return wrapped