diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5fd7c3e2179289..1ceb8b3e87c71c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -363,6 +363,7 @@ Reshaping - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) +- Bug in :func:`concat` where concatenating sparse and dense series returned a ``SparseDataFrame`` instead of a ``DataFrame`` (:issue:`18914`, :issue:`18686`, and :issue:`16874`) Numeric diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index cd98064dee86e7..956a209290411d 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -89,10 +89,10 @@ def _get_series_result_type(result, objs=None): def _get_frame_result_type(result, objs): """ return appropriate class of DataFrame-like concat - if any block is SparseBlock, return SparseDataFrame + if all blocks are SparseBlock, return SparseDataFrame otherwise, return 1st obj """ - if any(b.is_sparse for b in result.blocks): + if result.blocks and all(b.is_sparse for b in result.blocks): from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame else: diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 8a38b1054a1f5c..09b6eb1c100e60 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -168,7 +168,6 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if index is None: index = data.index.view() else: - data = data.reindex(index, copy=False) else: diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 22925cceb30d11..c9d079421532f8 100644 --- 
a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('sparse', [True, False]) + def test_get_dummies_dont_sparsify_all_columns(self, sparse): + # GH18914 + df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])]) + df = get_dummies(df, columns=['Nation'], sparse=sparse) + df2 = df.reindex(columns=['GDP']) + + tm.assert_frame_equal(df[['GDP']], df2) + class TestCategoricalReshape(object): diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 15639fbe156c6b..4ccccbe0a4f318 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -1,4 +1,5 @@ # pylint: disable-msg=E1101,W0612 +import pytest import numpy as np import pandas as pd @@ -317,37 +318,35 @@ def test_concat_axis1(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - def test_concat_sparse_dense(self): - sparse = self.dense1.to_sparse() + @pytest.mark.parametrize('fill_value', [None, 0]) + def test_concat_sparse_dense(self, fill_value): + sparse = self.dense1.to_sparse(fill_value=fill_value) res = pd.concat([sparse, self.dense2]) exp = pd.concat([self.dense1, self.dense2]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - res = pd.concat([self.dense2, sparse]) - exp = pd.concat([self.dense2, self.dense1]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - sparse = self.dense1.to_sparse(fill_value=0) - - res = pd.concat([sparse, self.dense2]) - exp = pd.concat([self.dense1, self.dense2]) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) res = pd.concat([self.dense2, sparse]) exp = pd.concat([self.dense2, self.dense1]) + assert isinstance(res, 
pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) res = pd.concat([self.dense3, sparse], axis=1) exp = pd.concat([self.dense3, self.dense1], axis=1) - assert isinstance(res, pd.SparseDataFrame) + # See GH18914 and #18686 for why this should be + # A DataFrame + assert isinstance(res, pd.DataFrame) + for column in self.dense3.columns: + tm.assert_series_equal(res[column], exp[column]) + tm.assert_frame_equal(res, exp) res = pd.concat([sparse, self.dense3], axis=1) exp = pd.concat([self.dense1, self.dense3], axis=1) - assert isinstance(res, pd.SparseDataFrame) + assert isinstance(res, pd.DataFrame) + for column in self.dense3.columns: + tm.assert_series_equal(res[column], exp[column]) tm.assert_frame_equal(res, exp)