Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix issue with concat creating SparseFrame if not all series are sparse. #18924

Merged
merged 13 commits into from
Feb 1, 2018
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,7 @@ Reshaping
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
- Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`)
-


Expand Down
10 changes: 6 additions & 4 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
_TD_DTYPE)
from pandas.core.dtypes.generic import (
ABCDatetimeIndex, ABCTimedeltaIndex,
ABCPeriodIndex, ABCRangeIndex)
ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame)


def get_dtype_kinds(l):
Expand Down Expand Up @@ -89,14 +89,16 @@ def _get_series_result_type(result, objs=None):
def _get_frame_result_type(result, objs):
"""
return appropriate class of DataFrame-like concat
if any block is SparseBlock, return SparseDataFrame
if all blocks are SparseBlock, return SparseDataFrame
otherwise, return 1st obj
"""
if any(b.is_sparse for b in result.blocks):

if result.blocks and all(b.is_sparse for b in result.blocks):
from pandas.core.sparse.api import SparseDataFrame
return SparseDataFrame
else:
return objs[0]
return next(obj for obj in objs if not isinstance(obj,
ABCSparseDataFrame))


def _concat_compat(to_concat, axis=0):
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def _check(cls, inst):

ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", ))
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", ))
ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp",
("sparse_frame", ))
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
('sparse_series',
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/dtypes/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class TestABCClasses(object):
df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
sparse_series = pd.Series([1, 2, 3]).to_sparse()
sparse_array = pd.SparseArray(np.random.randn(10))
sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})

def test_abc_types(self):
assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
Expand All @@ -37,6 +38,7 @@ def test_abc_types(self):
assert isinstance(self.df.to_panel(), gt.ABCPanel)
assert isinstance(self.sparse_series, gt.ABCSparseSeries)
assert isinstance(self.sparse_array, gt.ABCSparseArray)
assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame)
assert isinstance(self.categorical, gt.ABCCategorical)
assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):

tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('sparse', [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
df = get_dummies(df, columns=['Nation'], sparse=sparse)
df2 = df.reindex(columns=['GDP'])

tm.assert_frame_equal(df[['GDP']], df2)


class TestCategoricalReshape(object):

Expand Down
85 changes: 51 additions & 34 deletions pandas/tests/sparse/test_combine_concat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# pylint: disable-msg=E1101,W0612
import pytest

import numpy as np
import pandas as pd
import pandas.util.testing as tm
import itertools


class TestSparseSeriesConcat(object):
Expand Down Expand Up @@ -317,37 +319,52 @@ def test_concat_axis1(self):
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

def test_concat_sparse_dense(self):
sparse = self.dense1.to_sparse()

res = pd.concat([sparse, self.dense2])
exp = pd.concat([self.dense1, self.dense2])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense2, sparse])
exp = pd.concat([self.dense2, self.dense1])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

sparse = self.dense1.to_sparse(fill_value=0)

res = pd.concat([sparse, self.dense2])
exp = pd.concat([self.dense1, self.dense2])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense2, sparse])
exp = pd.concat([self.dense2, self.dense1])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense3, sparse], axis=1)
exp = pd.concat([self.dense3, self.dense1], axis=1)
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res, exp)

res = pd.concat([sparse, self.dense3], axis=1)
exp = pd.concat([self.dense1, self.dense3], axis=1)
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
itertools.product([None, 0, 1, np.nan],
[0, 1],
[1, 0]))
def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx):
frames = [self.dense1, self.dense2]
sparse_frame = [frames[dense_idx],
frames[sparse_idx].to_sparse(fill_value=fill_value)]
dense_frame = [frames[dense_idx], frames[sparse_idx]]

# This will try both directions sparse + dense and dense + sparse
for _ in range(2):
res = pd.concat(sparse_frame)
exp = pd.concat(dense_frame)

assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment here (and in test below), the purposes of the reverses & the loop

sparse_frame = sparse_frame[::-1]
dense_frame = dense_frame[::-1]

@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
itertools.product([None, 0, 1, np.nan],
[0, 1],
[1, 0]))
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
# See GH16874, GH18914 and #18686 for why this should be a DataFrame

frames = [self.dense1, self.dense3]

sparse_frame = [frames[dense_idx],
frames[sparse_idx].to_sparse(fill_value=fill_value)]
dense_frame = [frames[dense_idx], frames[sparse_idx]]

# This will try both directions sparse + dense and dense + sparse
for _ in range(2):
res = pd.concat(sparse_frame, axis=1)
exp = pd.concat(dense_frame, axis=1)

for column in frames[dense_idx].columns:
if dense_idx == sparse_idx:
tm.assert_frame_equal(res[column], exp[column])
else:
tm.assert_series_equal(res[column], exp[column])

tm.assert_frame_equal(res, exp)

sparse_frame = sparse_frame[::-1]
dense_frame = dense_frame[::-1]