BUG: fix issue with sparse concatting

This was originally brought up in :issue:`18686` and :issue:`18914`. The problem is that ``get_dummies`` with ``sparse=True`` returns a ``SparseDataFrame`` even though the result holds both sparse and dense columns. That is not what we want: the result should be a regular ``DataFrame`` containing sparse and dense columns. The function in ``pandas.core.dtypes.concat`` that chooses the result class for a frame concat (``_get_frame_result_type``) needed to be changed so that it only returns ``SparseDataFrame`` when all blocks are sparse.
pandas-dev · Jan 1, 2018 · 2ef4ed1 · 2ef4ed1
1 parent c19bdc9
commit 2ef4ed1
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 18 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -363,6 +363,7 @@ Reshaping
 - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
 - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
 - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string.  The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
+- Bug in :func:`concat` where concatenating sparse and dense series returned a ``SparseDataFrame`` instead of a ``DataFrame`` (:issue:`18914`, :issue:`18686`, :issue:`16874`)
 
 
 Numeric

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -89,10 +89,10 @@ def _get_series_result_type(result, objs=None):
 def _get_frame_result_type(result, objs):
     """
     return appropriate class of DataFrame-like concat
-    if any block is SparseBlock, return SparseDataFrame
+    if all blocks are SparseBlock, return SparseDataFrame
     otherwise, return 1st obj
     """
-    if any(b.is_sparse for b in result.blocks):
+    if result.blocks and all(b.is_sparse for b in result.blocks):
         from pandas.core.sparse.api import SparseDataFrame
         return SparseDataFrame
     else:

diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
@@ -168,7 +168,6 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
                 if index is None:
                     index = data.index.view()
                 else:
-
                     data = data.reindex(index, copy=False)
 
             else:

diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
@@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
 
             tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize('sparse', [True, False])
+    def test_get_dummies_dont_sparsify_all_columns(self, sparse):
+        # GH18914
+        df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
+        df = get_dummies(df, columns=['Nation'], sparse=sparse)
+        df2 = df.reindex(columns=['GDP'])
+
+        tm.assert_frame_equal(df[['GDP']], df2)
+
 
 class TestCategoricalReshape(object):
 

diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py
@@ -1,4 +1,5 @@
 # pylint: disable-msg=E1101,W0612
+import pytest
 
 import numpy as np
 import pandas as pd
@@ -317,37 +318,35 @@ def test_concat_axis1(self):
         assert isinstance(res, pd.SparseDataFrame)
         tm.assert_frame_equal(res.to_dense(), exp)
 
-    def test_concat_sparse_dense(self):
-        sparse = self.dense1.to_sparse()
 
+    @pytest.mark.parametrize('fill_value', [None, 0])
+    def test_concat_sparse_dense(self, fill_value):
+        sparse = self.dense1.to_sparse(fill_value=fill_value)
         res = pd.concat([sparse, self.dense2])
         exp = pd.concat([self.dense1, self.dense2])
-        assert isinstance(res, pd.SparseDataFrame)
-        tm.assert_frame_equal(res.to_dense(), exp)
 
-        res = pd.concat([self.dense2, sparse])
-        exp = pd.concat([self.dense2, self.dense1])
-        assert isinstance(res, pd.SparseDataFrame)
-        tm.assert_frame_equal(res.to_dense(), exp)
-
-        sparse = self.dense1.to_sparse(fill_value=0)
-
-        res = pd.concat([sparse, self.dense2])
-        exp = pd.concat([self.dense1, self.dense2])
         assert isinstance(res, pd.SparseDataFrame)
         tm.assert_frame_equal(res.to_dense(), exp)
 
         res = pd.concat([self.dense2, sparse])
         exp = pd.concat([self.dense2, self.dense1])
+
         assert isinstance(res, pd.SparseDataFrame)
         tm.assert_frame_equal(res.to_dense(), exp)
 
         res = pd.concat([self.dense3, sparse], axis=1)
         exp = pd.concat([self.dense3, self.dense1], axis=1)
-        assert isinstance(res, pd.SparseDataFrame)
+        # See GH18914 and GH18686 for why this should be
+        # a DataFrame rather than a SparseDataFrame
+        assert isinstance(res, pd.DataFrame)
+        for column in self.dense3.columns:
+            tm.assert_series_equal(res[column], exp[column])
+
         tm.assert_frame_equal(res, exp)
 
         res = pd.concat([sparse, self.dense3], axis=1)
         exp = pd.concat([self.dense1, self.dense3], axis=1)
-        assert isinstance(res, pd.SparseDataFrame)
+        assert isinstance(res, pd.DataFrame)
+        for column in self.dense3.columns:
+            tm.assert_series_equal(res[column], exp[column])
         tm.assert_frame_equal(res, exp)