diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5db29cb76b106..6cbdc3be07f13 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -540,6 +540,7 @@ Reshaping - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) +- Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) - diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 3e54ce61cd5b2..ddecbe85087d8 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -19,7 +19,7 @@ _TD_DTYPE) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, ABCTimedeltaIndex, - ABCPeriodIndex, ABCRangeIndex) + ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame) def get_dtype_kinds(l): @@ -89,14 +89,16 @@ def _get_series_result_type(result, objs=None): def _get_frame_result_type(result, objs): """ return appropriate class of DataFrame-like concat - if any block is SparseBlock, return SparseDataFrame + if all blocks are SparseBlock, return SparseDataFrame otherwise, return 1st obj """ - if any(b.is_sparse for b in result.blocks): + + if result.blocks and all(b.is_sparse for b in result.blocks): from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame else: - return objs[0] + return next(obj for obj in objs if not isinstance(obj, + ABCSparseDataFrame)) def _concat_compat(to_concat, axis=0): diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 6fae09c43d2be..b032cb6f14d4c 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -43,6 +43,8 @@ def _check(cls, inst): ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) +ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp", + ("sparse_frame", )) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 58cb182e7d403..53f92b98f022e 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -18,6 +18,7 @@ class TestABCClasses(object): df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index) sparse_series = pd.Series([1, 2, 3]).to_sparse() sparse_array = pd.SparseArray(np.random.randn(10)) + sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]}) def test_abc_types(self): assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex) @@ -37,6 +38,7 @@ def test_abc_types(self): assert isinstance(self.df.to_panel(), gt.ABCPanel) assert isinstance(self.sparse_series, gt.ABCSparseSeries) assert isinstance(self.sparse_array, gt.ABCSparseArray) + assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame) assert isinstance(self.categorical, gt.ABCCategorical) assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 22925cceb30d1..c9d079421532f 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('sparse', [True, False]) + def test_get_dummies_dont_sparsify_all_columns(self, sparse): + # GH18914 + df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])]) + df = get_dummies(df, columns=['Nation'], sparse=sparse) + df2 = df.reindex(columns=['GDP']) + + tm.assert_frame_equal(df[['GDP']], df2) + class TestCategoricalReshape(object): diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 15639fbe156c6..70fd1da529d46 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -1,8 +1,10 @@ # pylint: disable-msg=E1101,W0612 +import pytest import numpy as np import pandas as pd import pandas.util.testing as tm +import itertools class TestSparseSeriesConcat(object): @@ -317,37 +319,52 @@ def test_concat_axis1(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - def test_concat_sparse_dense(self): - sparse = self.dense1.to_sparse() - - res = pd.concat([sparse, self.dense2]) - exp = pd.concat([self.dense1, self.dense2]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([self.dense2, sparse]) - exp = pd.concat([self.dense2, self.dense1]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - sparse = self.dense1.to_sparse(fill_value=0) - - res = pd.concat([sparse, self.dense2]) - exp = pd.concat([self.dense1, self.dense2]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([self.dense2, sparse]) - exp = pd.concat([self.dense2, self.dense1]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([self.dense3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res, exp) - - res = pd.concat([sparse, self.dense3], axis=1) - exp = pd.concat([self.dense1, self.dense3], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res, exp) + @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', + itertools.product([None, 0, 1, np.nan], + [0, 1], + [1, 0])) + def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): + frames = [self.dense1, self.dense2] + sparse_frame = [frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value)] + dense_frame = [frames[dense_idx], frames[sparse_idx]] + + # This will try both directions sparse + dense and dense + sparse + for _ in range(2): + res = pd.concat(sparse_frame) + exp = pd.concat(dense_frame) + + assert isinstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), exp) + + sparse_frame = sparse_frame[::-1] + dense_frame = dense_frame[::-1] + + @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', + itertools.product([None, 0, 1, np.nan], + [0, 1], + [1, 0])) + def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx): + # See GH16874, GH18914 and #18686 for why this should be a DataFrame + + frames = [self.dense1, self.dense3] + + sparse_frame = [frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value)] + dense_frame = [frames[dense_idx], frames[sparse_idx]] + + # This will try both directions sparse + dense and dense + sparse + for _ in range(2): + res = pd.concat(sparse_frame, axis=1) + exp = pd.concat(dense_frame, axis=1) + + for column in frames[dense_idx].columns: + if dense_idx == sparse_idx: + tm.assert_frame_equal(res[column], exp[column]) + else: + tm.assert_series_equal(res[column], exp[column]) + + tm.assert_frame_equal(res, exp) + + sparse_frame = sparse_frame[::-1] + dense_frame = dense_frame[::-1]