Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix issue with concat creating SparseFrame if not all series are sparse. #18924

Merged
merged 13 commits into from
Feb 1, 2018
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,7 @@ Reshaping
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
- Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`)
-


Expand Down
13 changes: 8 additions & 5 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
_TD_DTYPE)
from pandas.core.dtypes.generic import (
ABCDatetimeIndex, ABCTimedeltaIndex,
ABCPeriodIndex, ABCRangeIndex)
ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame)


def get_dtype_kinds(l):
Expand Down Expand Up @@ -89,14 +89,17 @@ def _get_series_result_type(result, objs=None):
def _get_frame_result_type(result, objs):
"""
return appropriate class of DataFrame-like concat
if any block is SparseBlock, return SparseDataFrame
if all blocks are SparseBlock, return SparseDataFrame
otherwise, return 1st obj
"""
if any(b.is_sparse for b in result.blocks):
from pandas.core.sparse.api import SparseDataFrame

from pandas.core.sparse.api import SparseDataFrame
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could put this right before you actually use SparseDataFrame


if result.blocks and all(b.is_sparse for b in result.blocks):
return SparseDataFrame
else:
return objs[0]
return next(obj for obj in objs if not isinstance(obj,
ABCSparseDataFrame))


def _concat_compat(to_concat, axis=0):
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def _check(cls, inst):

ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", ))
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", ))
ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp",
("sparse_frame", ))
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
('sparse_series',
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/dtypes/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class TestABCClasses(object):
df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
sparse_series = pd.Series([1, 2, 3]).to_sparse()
sparse_array = pd.SparseArray(np.random.randn(10))
sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})

def test_abc_types(self):
assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
Expand All @@ -37,6 +38,7 @@ def test_abc_types(self):
assert isinstance(self.df.to_panel(), gt.ABCPanel)
assert isinstance(self.sparse_series, gt.ABCSparseSeries)
assert isinstance(self.sparse_array, gt.ABCSparseArray)
assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame)
assert isinstance(self.categorical, gt.ABCCategorical)
assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):

tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('sparse', [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
df = get_dummies(df, columns=['Nation'], sparse=sparse)
df2 = df.reindex(columns=['GDP'])

tm.assert_frame_equal(df[['GDP']], df2)


class TestCategoricalReshape(object):

Expand Down
89 changes: 55 additions & 34 deletions pandas/tests/sparse/test_combine_concat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# pylint: disable-msg=E1101,W0612
import pytest

import numpy as np
import pandas as pd
import pandas.util.testing as tm
import itertools


class TestSparseSeriesConcat(object):
Expand Down Expand Up @@ -317,37 +319,56 @@ def test_concat_axis1(self):
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

def test_concat_sparse_dense(self):
sparse = self.dense1.to_sparse()

res = pd.concat([sparse, self.dense2])
exp = pd.concat([self.dense1, self.dense2])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense2, sparse])
exp = pd.concat([self.dense2, self.dense1])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

sparse = self.dense1.to_sparse(fill_value=0)

res = pd.concat([sparse, self.dense2])
exp = pd.concat([self.dense1, self.dense2])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense2, sparse])
exp = pd.concat([self.dense2, self.dense1])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense3, sparse], axis=1)
exp = pd.concat([self.dense3, self.dense1], axis=1)
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res, exp)

res = pd.concat([sparse, self.dense3], axis=1)
exp = pd.concat([self.dense1, self.dense3], axis=1)
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
itertools.product([None, 0, 1, np.nan],
[0, 1],
[1, 0]))
def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx):
frames = [self.dense1, self.dense2]
sparse_frame = [frames[dense_idx],
frames[sparse_idx].to_sparse(fill_value=fill_value)]
dense_frame = [frames[dense_idx], frames[sparse_idx]]

for _ in range(2):
res = pd.concat(sparse_frame)
exp = pd.concat(dense_frame)

assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment here (and in test below), the purposes of the reverses & the loop

sparse_frame = sparse_frame[::-1]
dense_frame = dense_frame[::-1]

@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
itertools.product([None, 0, 1, np.nan],
[0, 1],
[1, 0]))
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
frames = [self.dense1, self.dense3]

sparse_frame = [frames[dense_idx],
frames[sparse_idx].to_sparse(fill_value=fill_value)]
dense_frame = [frames[dense_idx], frames[sparse_idx]]

for _ in range(2):
res = pd.concat(sparse_frame, axis=1)
exp = pd.concat(dense_frame, axis=1)

# See GH18914 and #18686 for why this should be
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove these asserts (357-364), the assert on the frame/series is comprehensive. you can not the issue numbers at the top of the test instead

# A DataFrame
assert type(res) is pd.DataFrame
# See GH16874
assert not res.isnull().empty
assert not res[res.columns[0]].empty
assert res.iloc[0, 0] == exp.iloc[0, 0]

for column in frames[dense_idx].columns:
if dense_idx == sparse_idx:
tm.assert_frame_equal(res[column], exp[column])
else:
tm.assert_series_equal(res[column], exp[column])

tm.assert_frame_equal(res, exp)

sparse_frame = sparse_frame[::-1]
dense_frame = dense_frame[::-1]