From cf56ff1229827bc7c7a70472b1bf1dde60a8efb4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 2 Nov 2014 13:01:37 -0500 Subject: [PATCH] BUG: concat of series of dtype category converting to object dtype (GH8641) --- doc/source/whatsnew/v0.15.2.txt | 1 + pandas/core/categorical.py | 118 +++++++++++++++++++++-------- pandas/core/common.py | 118 +++++++++++++++++++++-------- pandas/core/generic.py | 5 +- pandas/core/internals.py | 79 +++++-------------- pandas/sparse/array.py | 43 +++++++++++ pandas/sparse/tests/test_sparse.py | 5 +- pandas/tests/test_categorical.py | 17 +++++ pandas/tests/test_series.py | 86 ++++++++++++++++++++- pandas/tools/merge.py | 10 ++- pandas/tools/tests/test_merge.py | 1 + pandas/tseries/common.py | 58 +++++++++++++- 12 files changed, 407 insertions(+), 134 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 66b839ed01a29..8ea79089f95e3 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -20,6 +20,7 @@ users upgrade to this version. API changes ~~~~~~~~~~~ +- Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`) .. _whatsnew_0152.enhancements: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index dd23897a3f7e9..414c4a8315e6d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -15,7 +15,12 @@ import pandas.core.common as com from pandas.util.decorators import cache_readonly -from pandas.core.common import isnull +from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull, + is_categorical_dtype, is_integer_dtype, is_object_dtype, + _possibly_infer_to_datetimelike, get_dtype_kinds, + is_list_like, _is_sequence, + _ensure_platform_int, _ensure_object, _ensure_int64, + _coerce_indexer_dtype, _values_from_object, take_1d) from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option from pandas.core import format as fmt @@ -69,11 +74,11 @@ def f(self, other): def _is_categorical(array): """ return if we are a categorical possibility """ - return isinstance(array, Categorical) or isinstance(array.dtype, com.CategoricalDtype) + return isinstance(array, Categorical) or isinstance(array.dtype, CategoricalDtype) def _maybe_to_categorical(array): """ coerce to a categorical if a series is given """ - if isinstance(array, com.ABCSeries): + if isinstance(array, ABCSeries): return array.values return array @@ -175,7 +180,7 @@ class Categorical(PandasObject): >>> a.min() 'c' """ - dtype = com.CategoricalDtype() + dtype = CategoricalDtype() """The dtype (always "category")""" ordered = None @@ -203,7 +208,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa if fastpath: # fast path - self._codes = com._coerce_indexer_dtype(values, categories) + self._codes = _coerce_indexer_dtype(values, categories) self.name = name self.categories = categories self.ordered = ordered @@ -223,11 +228,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa "use only 'categories'") # sanitize input - if com.is_categorical_dtype(values): + if is_categorical_dtype(values): # we are either a Series or a Categorical cat = values - if isinstance(values, com.ABCSeries): + if isinstance(values, ABCSeries): cat = values.values if categories is None: categories = cat.categories @@ -244,7 +249,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa # which is fine, but since factorize does this 
correctly no need here # this is an issue because _sanitize_array also coerces np.nan to a string # under certain versions of numpy as well - values = com._possibly_infer_to_datetimelike(values, convert_dates=True) + values = _possibly_infer_to_datetimelike(values, convert_dates=True) if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) from pandas.core.series import _sanitize_array @@ -286,11 +291,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa codes = _get_codes_for_values(values, categories) # TODO: check for old style usage. These warnings should be removes after 0.18/ in 2016 - if com.is_integer_dtype(values) and not com.is_integer_dtype(categories): + if is_integer_dtype(values) and not is_integer_dtype(categories): warn("Values and categories have different dtypes. Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - if com.is_integer_dtype(values) and (codes == -1).all(): + if is_integer_dtype(values) and (codes == -1).all(): warn("None of the categories were found in values. Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) @@ -302,7 +307,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa self.ordered = False if ordered is None else ordered self.categories = categories self.name = name - self._codes = com._coerce_indexer_dtype(codes, categories) + self._codes = _coerce_indexer_dtype(codes, categories) def copy(self): """ Copy constructor. """ @@ -409,7 +414,7 @@ def _validate_categories(cls, categories): # on categories with NaNs, int values would be converted to float. # Use "object" dtype to prevent this. if isnull(categories).any(): - without_na = np.array([x for x in categories if com.notnull(x)]) + without_na = np.array([x for x in categories if notnull(x)]) with_na = np.array(categories) if with_na.dtype != without_na.dtype: dtype = "object" @@ -617,7 +622,7 @@ def add_categories(self, new_categories, inplace=False): remove_unused_categories set_categories """ - if not com.is_list_like(new_categories): + if not is_list_like(new_categories): new_categories = [new_categories] already_included = set(new_categories) & set(self._categories) if len(already_included) != 0: @@ -627,7 +632,7 @@ def add_categories(self, new_categories, inplace=False): new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() cat._categories = new_categories - cat._codes = com._coerce_indexer_dtype(cat._codes, new_categories) + cat._codes = _coerce_indexer_dtype(cat._codes, new_categories) if not inplace: return cat @@ -662,7 +667,7 @@ def remove_categories(self, removals, inplace=False): remove_unused_categories set_categories """ - if not com.is_list_like(removals): + if not is_list_like(removals): removals = [removals] removals = set(list(removals)) not_included = removals - set(self._categories) @@ -696,7 +701,7 @@ def remove_unused_categories(self, inplace=False): """ cat = self if inplace else self.copy() _used = sorted(np.unique(cat._codes)) - new_categories = cat.categories.take(com._ensure_platform_int(_used)) + new_categories = cat.categories.take(_ensure_platform_int(_used)) new_categories = _ensure_index(new_categories) cat._codes = _get_codes_for_values(cat.__array__(), new_categories) cat._categories = new_categories @@ -734,7 +739,7 @@ def __array__(self, dtype=None): A numpy array of either the specified dtype or, if dtype==None (default), the same dtype as 
categorical.categories.dtype """ - ret = com.take_1d(self.categories.values, self._codes) + ret = take_1d(self.categories.values, self._codes) if dtype and dtype != self.categories.dtype: return np.asarray(ret, dtype) return ret @@ -822,8 +827,8 @@ def get_values(self): # if we are a period index, return a string repr if isinstance(self.categories, PeriodIndex): - return com.take_1d(np.array(self.categories.to_native_types(), dtype=object), - self._codes) + return take_1d(np.array(self.categories.to_native_types(), dtype=object), + self._codes) return np.array(self) @@ -1010,7 +1015,7 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): else: - if not com.isnull(fill_value) and fill_value not in self.categories: + if not isnull(fill_value) and fill_value not in self.categories: raise ValueError("fill value must be in categories") mask = values==-1 @@ -1031,7 +1036,7 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): # but is passed thru internally assert isnull(fill_value) - codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) + codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) result = Categorical(codes, categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) return result @@ -1178,7 +1183,7 @@ def __setitem__(self, key, value): raise ValueError("Cannot set a Categorical with another, without identical " "categories") - rvalue = value if com.is_list_like(value) else [value] + rvalue = value if is_list_like(value) else [value] to_add = Index(rvalue).difference(self.categories) # no assignments of values not in categories, but it's always ok to set something to np.nan if len(to_add) and not isnull(to_add).all(): @@ -1221,7 +1226,7 @@ def __setitem__(self, key, value): # float categories do currently return -1 for np.nan, even if np.nan is included in the # index -> "repair" this here if isnull(rvalue).any() and isnull(self.categories).any(): - nan_pos = np.where(com.isnull(self.categories))[0] + nan_pos = np.where(isnull(self.categories))[0] lindexer[lindexer == -1] = nan_pos key = self._maybe_coerce_indexer(key) @@ -1304,7 +1309,7 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 - result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))), + result = Categorical(sorted(htable.mode_int64(_ensure_int64(self._codes[good]))), categories=self.categories,ordered=self.ordered, name=self.name, fastpath=True) return result @@ -1373,9 +1378,9 @@ def describe(self): categories = np.arange(0,len(self.categories)+1 ,dtype=object) categories[:-1] = self.categories categories[-1] = np.nan - result.index = categories.take(com._ensure_platform_int(result.index)) + result.index = categories.take(_ensure_platform_int(result.index)) else: - result.index = self.categories.take(com._ensure_platform_int(result.index)) + result.index = self.categories.take(_ensure_platform_int(result.index)) result = result.reindex(self.categories) result.index.name = 'categories' @@ -1447,23 +1452,72 @@ def _get_codes_for_values(values, categories): from pandas.core.algorithms import _get_data_algo, _hashtables if values.dtype != categories.dtype: - values = com._ensure_object(values) - categories = com._ensure_object(categories) + values = _ensure_object(values) + categories = _ensure_object(categories) (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) t = hash_klass(len(categories)) - t.map_locations(com._values_from_object(categories)) - return 
com._coerce_indexer_dtype(t.lookup(values), categories)
+    t.map_locations(_values_from_object(categories))
+    return _coerce_indexer_dtype(t.lookup(values), categories)
 
 def _convert_to_list_like(list_like):
     if hasattr(list_like, "dtype"):
         return list_like
     if isinstance(list_like, list):
         return list_like
-    if (com._is_sequence(list_like) or isinstance(list_like, tuple)
-            or isinstance(list_like, types.GeneratorType)):
+    if (_is_sequence(list_like) or isinstance(list_like, tuple)
+            or isinstance(list_like, types.GeneratorType)):
         return list(list_like)
     elif np.isscalar(list_like):
         return [list_like]
     else:
         # is this reached?
         return [list_like]
+
+def _concat_compat(to_concat, axis=0):
+    """
+    provide concatenation of an array of object/categorical arrays,
+    each of which is a single dtype
+
+    Parameters
+    ----------
+    to_concat : array of arrays
+    axis : axis to provide concatenation
+
+    Returns
+    -------
+    a single array, preserving the combined dtypes
+    """
+
+    def convert_categorical(x):
+        # coerce to object dtype
+        if is_categorical_dtype(x.dtype):
+            return x.get_values()
+        return x.ravel()
+
+    typs = get_dtype_kinds(to_concat)
+    if not len(typs-set(['object','category'])):
+
+        # we can only deal with object & category types here
+        pass
+
+    else:
+
+        # convert to object type and perform a regular concat
+        from pandas.core.common import _concat_compat
+        return _concat_compat([ np.array(x,copy=False).astype('object') for x in to_concat ],axis=axis)
+
+    # we could have object arrays and categoricals here
+    # if the categoricals all share a single set of categories then combine everything
+    # else it's an incompatible categorical concat
+    categoricals = [ x for x in to_concat if is_categorical_dtype(x.dtype) ]
+    objects = [ x for x in to_concat if is_object_dtype(x.dtype) ]
+
+    # validate the categories
+    categories = None
+    for x in categoricals:
+        if categories is None:
+            categories = x.categories
+        if not categories.equals(x.categories):
+            raise ValueError("incompatible categories in categorical concat")
+
+    # concat them
+    return Categorical(np.concatenate([ convert_categorical(x) for x in to_concat ],axis=axis), categories=categories)
diff --git a/pandas/core/common.py b/pandas/core/common.py
index f5de6c7da8914..759f5f1dfaf7a 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -2768,9 +2768,62 @@ def _check_as_is(x):
         self.queue.truncate(0)
 
 
+def get_dtype_kinds(l):
+    """
+    Parameters
+    ----------
+    l : list of arrays
+
+    Returns
+    -------
+    a set of kinds that exist in this list of arrays
+    """
+
+    typs = set()
+    for arr in l:
+
+        dtype = arr.dtype
+        if is_categorical_dtype(dtype):
+            typ = 'category'
+        elif isinstance(arr, ABCSparseArray):
+            typ = 'sparse'
+        elif is_datetime64_dtype(dtype):
+            typ = 'datetime'
+        elif is_timedelta64_dtype(dtype):
+            typ = 'timedelta'
+        elif is_object_dtype(dtype):
+            typ = 'object'
+        elif is_bool_dtype(dtype):
+            typ = 'bool'
+        else:
+            typ = dtype.kind
+        typs.add(typ)
+    return typs
+
 def _concat_compat(to_concat, axis=0):
+    """
+    provide concatenation of an array of arrays each of which is a single
+    'normalized' dtype (in that, for example, if it's object, then it is a
+    non-datetimelike array) and provide a combined dtype for the resulting
+    array that preserves the overall dtype if possible
+
+    Parameters
+    ----------
+    to_concat : array of arrays
+    axis : axis to provide concatenation
+
+    Returns
+    -------
+    a single array, preserving the combined dtypes
+    """
+
     # filter empty arrays
-    nonempty = [x for x in to_concat if x.shape[axis] > 0]
+    # 1-d dtypes are always included here
+    def
is_nonempty(x): + try: + return x.shape[axis] > 0 + except Exception: + return True + nonempty = [x for x in to_concat if is_nonempty(x)] # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. @@ -2778,38 +2831,37 @@ def _concat_compat(to_concat, axis=0): # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. - if nonempty: - - is_datetime64 = [x.dtype == _NS_DTYPE for x in nonempty] - is_timedelta64 = [x.dtype == _TD_DTYPE for x in nonempty] - - if all(is_datetime64): - new_values = np.concatenate([x.view(np.int64) for x in nonempty], - axis=axis) - return new_values.view(_NS_DTYPE) - elif all(is_timedelta64): - new_values = np.concatenate([x.view(np.int64) for x in nonempty], - axis=axis) - return new_values.view(_TD_DTYPE) - elif any(is_datetime64) or any(is_timedelta64): - to_concat = [_to_pydatetime(x) for x in nonempty] - - return np.concatenate(to_concat, axis=axis) - - -def _to_pydatetime(x): - # coerce to an object dtyped - - if x.dtype == _NS_DTYPE: - shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) - x = x.reshape(shape) - elif x.dtype == _TD_DTYPE: - shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) - x = x.reshape(shape) - - return x + + typs = get_dtype_kinds(to_concat) + + # these are mandated to handle empties as well + if 'datetime' in typs or 'timedelta' in typs: + from pandas.tseries.common import _concat_compat + return _concat_compat(to_concat, axis=axis) + + elif 'sparse' in typs: + from pandas.sparse.array import _concat_compat + return _concat_compat(to_concat, axis=axis) + + elif 'category' in typs: + from pandas.core.categorical import _concat_compat + return _concat_compat(to_concat, axis=axis) + + if not nonempty: + + # we have all empties, but may need to coerce the result dtype to object if we + # have non-numeric type operands (numpy would otherwise cast this to float) + typs = get_dtype_kinds(to_concat) + if len(typs) != 1: + + if not len(typs-set(['i','u','f'])) or not len(typs-set(['bool','i','u'])): + # let numpy coerce + pass + else: + # coerce to object + to_concat = [ x.astype('object') for x in to_concat ] + + return np.concatenate(to_concat,axis=axis) def _where_compat(mask, arr1, arr2): if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bccc0e7b6be14..89178ba2d9dcc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -271,14 +271,15 @@ def _construct_axes_from_arguments(self, args, kwargs, require_all=False): return axes, kwargs @classmethod - def _from_axes(cls, data, axes): + def _from_axes(cls, data, axes, **kwargs): # for construction from BlockManager if isinstance(data, BlockManager): - return cls(data) + return cls(data, **kwargs) else: if cls._AXIS_REVERSED: axes = axes[::-1] d = cls._construct_axes_dict_from(cls, axes, copy=False) + d.update(kwargs) return cls(data, **d) def _get_axis_number(self, axis): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bb81258efe4c5..7ab3e4d8d9482 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -493,18 +493,6 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): values[mask] = na_rep return values.tolist() - def _concat_blocks(self, blocks, values): - """ return the block concatenation """ - - # dispatch to a categorical 
to handle the concat - if self._holder is None: - - for b in blocks: - if b.is_categorical: - return b._concat_blocks(blocks,values) - - return self._holder(values[0]) - # block actions #### def copy(self, deep=True): values = self.values @@ -1759,34 +1747,6 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, ndim=self.ndim, placement=self.mgr_locs) - def _concat_blocks(self, blocks, values): - """ - validate that we can merge these blocks - - return the block concatenation - """ - - # we could have object blocks and categorical's here - # if we only have a single cateogoricals then combine everything - # else its a non-compat categorical - - categoricals = [ b for b in blocks if b.is_categorical ] - objects = [ b for b in blocks if not b.is_categorical and b.is_object ] - - # convert everything to object and call it a day - if len(objects) + len(categoricals) != len(blocks): - raise ValueError("try to combine non-object blocks and categoricals") - - # validate the categories - categories = None - for b in categoricals: - if categories is None: - categories = b.values.categories - if not categories.equals(b.values.categories): - raise ValueError("incompatible categories in categorical block merge") - - return self._holder(values[0], categories=categories) - def to_native_types(self, slicer=None, na_rep='', **kwargs): """ convert to our native types format, slicing if desired """ @@ -4102,22 +4062,15 @@ def get_empty_dtype_and_na(join_units): blk = join_units[0].block if blk is None: return np.float64, np.nan - else: - return blk.dtype, None has_none_blocks = False dtypes = [None] * len(join_units) - for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype - if not has_none_blocks and len(set(dtypes)) == 1: - # Unanimous decision, nothing to upcast. 
- return dtypes[0], None - # dtypes = set() upcast_classes = set() null_upcast_classes = set() @@ -4127,7 +4080,9 @@ def get_empty_dtype_and_na(join_units): if com.is_categorical_dtype(dtype): upcast_cls = 'category' - elif issubclass(dtype.type, (np.object_, np.bool_)): + elif issubclass(dtype.type, np.bool_): + upcast_cls = 'bool' + elif issubclass(dtype.type, np.object_): upcast_cls = 'object' elif is_datetime64_dtype(dtype): upcast_cls = 'datetime' @@ -4150,6 +4105,11 @@ def get_empty_dtype_and_na(join_units): # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan + elif 'bool' in upcast_classes: + if has_none_blocks: + return np.dtype(np.object_), np.nan + else: + return np.dtype(np.bool_), None elif 'category' in upcast_classes: return com.CategoricalDtype(), np.nan elif 'float' in upcast_classes: @@ -4184,14 +4144,7 @@ def concatenate_join_units(join_units, concat_axis, copy): else: concat_values = com._concat_compat(to_concat, axis=concat_axis) - if any(unit.needs_block_conversion for unit in join_units): - - # need to ask the join unit block to convert to the underlying repr for us - blocks = [ unit.block for unit in join_units if unit.block is not None ] - return blocks[0]._concat_blocks(blocks, concat_values) - else: - return concat_values - + return concat_values def get_mgr_concatenation_plan(mgr, indexers): """ @@ -4231,6 +4184,7 @@ def get_mgr_concatenation_plan(mgr, indexers): plan = [] for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), group=False): + assert placements.is_slice_like join_unit_indexers = indexers.copy() @@ -4442,6 +4396,14 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): missing_arr.fill(fill_value) return missing_arr + if not self.indexers: + if self.block.is_categorical: + # preserve the categoricals for validation in _concat_compat + return self.block.values + elif self.block.is_sparse: + # preserve the sparse array for validation in _concat_compat + return self.block.values + if self.block.is_bool: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. @@ -4455,13 +4417,14 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. 
-            return values.view()
+            values = values.view()
+
         else:
             for ax, indexer in self.indexers.items():
                 values = com.take_nd(values, indexer, axis=ax,
                                      fill_value=fill_value)
 
-            return values
+        return values
 
 def _fast_count_smallints(arr):
diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
index 38a5688ed96e8..b765fdb8d67be 100644
--- a/pandas/sparse/array.py
+++ b/pandas/sparse/array.py
@@ -529,3 +529,46 @@ def make_sparse(arr, kind='block', fill_value=nan):
 
 ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
                                    use_numexpr=False)
+
+
+
+def _concat_compat(to_concat, axis=0):
+    """
+    provide concatenation of a sparse/dense array of arrays each of which is a single dtype
+
+    Parameters
+    ----------
+    to_concat : array of arrays
+    axis : axis to provide concatenation
+
+    Returns
+    -------
+    a single array, preserving the combined dtypes
+    """
+
+    def convert_sparse(x, axis):
+        # coerce to native type
+        if isinstance(x, SparseArray):
+            x = x.get_values()
+        x = x.ravel()
+        if axis > 0:
+            x = np.atleast_2d(x)
+        return x
+
+    typs = com.get_dtype_kinds(to_concat)
+
+    # densify so we can do a regular concat (we may re-sparsify below)
+    to_concat = [ convert_sparse(x, axis) for x in to_concat ]
+    result = np.concatenate(to_concat,axis=axis)
+
+    if not len(typs-set(['sparse','f','i'])):
+
+        # we can remain sparse
+        result = SparseArray(result.ravel())
+
+    else:
+
+        # coerce to object if needed
+        result = result.astype('object')
+
+    return result
diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py
index 105f661f08b10..9197a4fc22b9c 100644
--- a/pandas/sparse/tests/test_sparse.py
+++ b/pandas/sparse/tests/test_sparse.py
@@ -168,6 +168,9 @@ def test_construct_DataFrame_with_sp_series(self):
 
         assert_sp_series_equal(df['col'], self.bseries)
 
+        result = df.iloc[:,0]
+        assert_sp_series_equal(result, self.bseries)
+
         # blocking
         expected = Series({'col': 'float64:sparse'})
         result = df.ftypes
@@ -909,8 +912,8 @@ def test_dtypes(self):
     def test_str(self):
         df = DataFrame(np.random.randn(10000, 4))
         df.ix[:9998] = np.nan
-        sdf = df.to_sparse()
 
+        sdf = df.to_sparse()
         str(sdf)
 
     def test_array_interface(self):
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 624c6cf9688d6..dc82abfb40e02 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -2246,6 +2246,23 @@ def f():
             dfx['grade'].cat.categories
         self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories)
 
+        # GH 8641
+        # series concat not preserving category dtype
+        s = Series(list('abc'),dtype='category')
+        s2 = Series(list('abd'),dtype='category')
+
+        def f():
+            pd.concat([s,s2])
+        self.assertRaises(ValueError, f)
+
+        result = pd.concat([s,s],ignore_index=True)
+        expected = Series(list('abcabc')).astype('category')
+        tm.assert_series_equal(result, expected)
+
+        result = pd.concat([s,s])
+        expected = Series(list('abcabc'),index=[0,1,2,0,1,2]).astype('category')
+        tm.assert_series_equal(result, expected)
+
     def test_append(self):
         cat = pd.Categorical(["a","b"], categories=["a","b"])
         vals = [1,2]
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 938d171506461..9ecdcd2b12d75 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -6198,13 +6198,93 @@ def test_numpy_unique(self):
         # it works!
        result = np.unique(self.ts)
 
+    def test_concat_empty_series_dtypes_roundtrips(self):
+
+        # round-tripping with self & like self
+        # (use list(...) so the dtypes can be iterated more than once)
+        dtypes = list(map(np.dtype,['float64','int8','uint8','bool','m8[ns]','M8[ns]']))
+
+        for dtype in dtypes:
+            self.assertEqual(pd.concat([Series(dtype=dtype)]).dtype, dtype)
+            self.assertEqual(pd.concat([Series(dtype=dtype),
+                                        Series(dtype=dtype)]).dtype, dtype)
+
+        def int_result_type(dtype, dtype2):
+            typs = set([dtype.kind,dtype2.kind])
+            if not len(typs-set(['i','u','b'])) and (dtype.kind == 'i' or dtype2.kind == 'i'):
+                return 'i'
+            elif not len(typs-set(['u','b'])) and (dtype.kind == 'u' or dtype2.kind == 'u'):
+                return 'u'
+            return None
+
+        def float_result_type(dtype, dtype2):
+            typs = set([dtype.kind,dtype2.kind])
+            if not len(typs-set(['f','i','u'])) and (dtype.kind == 'f' or dtype2.kind == 'f'):
+                return 'f'
+            return None
+
+        def get_result_type(dtype, dtype2):
+            result = float_result_type(dtype, dtype2)
+            if result is not None:
+                return result
+            result = int_result_type(dtype, dtype2)
+            if result is not None:
+                return result
+            return 'O'
+
+        for dtype in dtypes:
+            for dtype2 in dtypes:
+                if dtype == dtype2:
+                    continue
+
+                expected = get_result_type(dtype, dtype2)
+                result = pd.concat([Series(dtype=dtype),
+                                    Series(dtype=dtype2)]).dtype
+                self.assertEqual(result.kind, expected)
+
     def test_concat_empty_series_dtypes(self):
-        self.assertEqual(pd.concat([Series(dtype=np.float64)]).dtype, np.float64)
-        self.assertEqual(pd.concat([Series(dtype=np.int8)]).dtype, np.int8)
-        self.assertEqual(pd.concat([Series(dtype=np.bool_)]).dtype, np.bool_)
+        # bools
         self.assertEqual(pd.concat([Series(dtype=np.bool_),
                                     Series(dtype=np.int32)]).dtype, np.int32)
+        self.assertEqual(pd.concat([Series(dtype=np.bool_),
+                                    Series(dtype=np.float32)]).dtype, np.object_)
+
+        # datetimelike
+        self.assertEqual(pd.concat([Series(dtype='m8[ns]'),
+                                    Series(dtype=np.bool)]).dtype, np.object_)
+        self.assertEqual(pd.concat([Series(dtype='m8[ns]'),
+                                    Series(dtype=np.int64)]).dtype, np.object_)
+        self.assertEqual(pd.concat([Series(dtype='M8[ns]'),
+                                    Series(dtype=np.bool)]).dtype, np.object_)
+        self.assertEqual(pd.concat([Series(dtype='M8[ns]'),
+                                    Series(dtype=np.int64)]).dtype, np.object_)
+        self.assertEqual(pd.concat([Series(dtype='M8[ns]'),
+                                    Series(dtype=np.bool_),
+                                    Series(dtype=np.int64)]).dtype, np.object_)
+
+        # categorical
+        self.assertEqual(pd.concat([Series(dtype='category'),
+                                    Series(dtype='category')]).dtype, 'category')
+        self.assertEqual(pd.concat([Series(dtype='category'),
+                                    Series(dtype='float64')]).dtype, np.object_)
+        self.assertEqual(pd.concat([Series(dtype='category'),
+                                    Series(dtype='object')]).dtype, 'category')
+
+        # sparse
+        result = pd.concat([Series(dtype='float64').to_sparse(),
+                            Series(dtype='float64').to_sparse()])
+        self.assertEqual(result.dtype,np.float64)
+        self.assertEqual(result.ftype,'float64:sparse')
+
+        result = pd.concat([Series(dtype='float64').to_sparse(),
+                            Series(dtype='float64')])
+        self.assertEqual(result.dtype,np.float64)
+        self.assertEqual(result.ftype,'float64:sparse')
+
+        result = pd.concat([Series(dtype='float64').to_sparse(),
+                            Series(dtype='object')])
+        self.assertEqual(result.dtype,np.object_)
+        self.assertEqual(result.ftype,'object:dense')
 
     def test_searchsorted_numeric_dtypes_scalar(self):
         s = Series([1, 2, 90, 1000, 3e9])
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 7a89c317a69c6..2f0920b6d4e98 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -854,11 +854,17 @@ def __init__(self, objs, axis=0, join='outer',
join_axes=None,
         self.new_axes = self._get_new_axes()
 
     def get_result(self):
+
+        # series only
         if self._is_series:
+
+            # stack blocks
             if self.axis == 0:
-                new_data = com._concat_compat([x.get_values() for x in self.objs])
+                new_data = com._concat_compat([x.values for x in self.objs])
                 name = com._consensus_name_attr(self.objs)
                 return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat')
+
+            # combine as columns in a frame
             else:
                 data = dict(zip(range(len(self.objs)), self.objs))
                 index, columns = self.new_axes
@@ -866,6 +872,8 @@ def get_result(self):
                 tmpdf = DataFrame(data, index=index)
                 if columns is not None:
                     tmpdf.columns = columns
                 return tmpdf.__finalize__(self, method='concat')
+
+        # combine block managers
         else:
             mgrs_indexers = []
             for obj in self.objs:
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 8f375ca168edd..c942998d430f4 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -2056,6 +2056,7 @@ def test_panel4d_concat_mixed_type(self):
         tm.assert_panel4d_equal(result, expected)
 
     def test_concat_series(self):
+
         ts = tm.makeTimeSeries()
         ts.name = 'foo'
diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py
index 227af42f07411..f12e0263bcf0c 100644
--- a/pandas/tseries/common.py
+++ b/pandas/tseries/common.py
@@ -5,6 +5,9 @@
 from pandas.core import common as com
 from pandas import Series, DatetimeIndex, PeriodIndex, TimedeltaIndex
 from pandas import lib, tslib
+from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike,
+                                is_datetime_arraylike, is_integer_dtype, is_list_like,
+                                get_dtype_kinds)
 
 def is_datetimelike(data):
     """ return a boolean if we can be successfully converted to a datetimelike """
@@ -42,9 +45,9 @@ def maybe_to_datetimelike(data, copy=False):
     elif issubclass(data.dtype.type, np.timedelta64):
         return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index)
     else:
-        if com.is_period_arraylike(data):
+        if is_period_arraylike(data):
             return PeriodProperties(PeriodIndex(data, copy=copy), index)
-        if com.is_datetime_arraylike(data):
+        if is_datetime_arraylike(data):
             return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index)
 
     raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data)))
@@ -60,9 +63,9 @@ def _delegate_property_get(self, name):
 
         # maybe need to upcast (ints)
         if isinstance(result, np.ndarray):
-            if com.is_integer_dtype(result):
+            if is_integer_dtype(result):
                 result = result.astype('int64')
-        elif not com.is_list_like(result):
+        elif not is_list_like(result):
             return result
 
         # return the result as a Series, which is by definition a copy
@@ -162,3 +165,50 @@ class PeriodProperties(Properties):
 
 PeriodProperties._add_delegate_accessors(delegate=PeriodIndex,
                                          accessors=PeriodIndex._datetimelike_ops,
                                          typ='property')
+
+def _concat_compat(to_concat, axis=0):
+    """
+    provide concatenation of a datetimelike array of arrays each of which is a single
+    M8[ns] or m8[ns] dtype
+
+    Parameters
+    ----------
+    to_concat : array of arrays
+    axis : axis to provide concatenation
+
+    Returns
+    -------
+    a single array, preserving the combined dtypes
+    """
+
+    def convert_to_pydatetime(x, axis):
+        # coerce to an object dtype
+        if x.dtype == _NS_DTYPE:
+            shape = x.shape
+            x = tslib.ints_to_pydatetime(x.view(np.int64).ravel())
+            x = x.reshape(shape)
+        elif x.dtype == _TD_DTYPE:
+            shape = x.shape
+            x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel())
+            x = x.reshape(shape)
+        return x
+
+    typs = get_dtype_kinds(to_concat)
+
# single dtype + if len(typs) == 1: + + if not len(typs-set(['datetime'])): + new_values = np.concatenate([x.view(np.int64) for x in to_concat], + axis=axis) + return new_values.view(_NS_DTYPE) + + elif not len(typs-set(['timedelta'])): + new_values = np.concatenate([x.view(np.int64) for x in to_concat], + axis=axis) + return new_values.view(_TD_DTYPE) + + # need to coerce to object + to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] + + return np.concatenate(to_concat,axis=axis)
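
For reviewers, the user-visible behavior this patch establishes can be sketched as follows. This snippet is illustrative only and is not part of the commit; the names `s` and `incompatible` are made up, and the dtype expectations are taken directly from the tests added above in test_series.py and test_categorical.py:

    import numpy as np
    import pandas as pd
    from pandas import Series

    # concat of identically-categorized series now preserves category dtype (GH8641)
    s = Series(list('abc'), dtype='category')
    result = pd.concat([s, s], ignore_index=True)
    assert result.dtype == 'category'

    # series whose categories differ raise rather than silently coercing
    incompatible = Series(list('abd'), dtype='category')
    try:
        pd.concat([s, incompatible])
    except ValueError:
        pass  # "incompatible categories in categorical concat"

    # mixing category with a non-object dtype still falls back to object
    assert pd.concat([Series(dtype='category'),
                      Series(dtype='float64')]).dtype == np.object_

    # sparse inputs stay sparse when the combined dtypes allow it
    result = pd.concat([Series(dtype='float64').to_sparse(),
                        Series(dtype='float64').to_sparse()])
    assert result.ftype == 'float64:sparse'

The dispatch itself lives in pandas/core/common.py: get_dtype_kinds classifies the inputs, and _concat_compat hands off to the categorical, sparse, or datetimelike _concat_compat variants added in this patch before falling back to a plain np.concatenate.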