Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF/INT: concat blocks of same type with preserving block type #17728

Merged
merged 13 commits into from
Oct 12, 2017
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,14 @@ def ftype(self):
def merge(self, other):
return _merge_blocks([self, other])

def concat_same_type(self, to_concat):
    """
    Concatenate list of single blocks of the same type.
    """
    # Stack the underlying ndarrays of each block, then wrap the result
    # in a block of this same class spanning the full concatenated length.
    stacked = np.concatenate([b.values for b in to_concat])
    new_placement = slice(0, len(stacked), 1)
    return self.make_block_same_class(stacked, placement=new_placement)

def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
limit=None, mask_info=None):
"""
Expand Down Expand Up @@ -2407,6 +2415,19 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
# we are expected to return a 2-d ndarray
return values.reshape(1, len(values))

def concat_same_type(self, to_concat):
"""
Concatenate list of single blocks of the same type.
"""
to_concat = [blk.values for blk in to_concat]
values = _concat._concat_categorical(to_concat)

if is_categorical_dtype(values.dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you could dispense with all of these if/thens and just use make_block

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I could, but I wanted to make advantage of the fact that for those case it is categorical, I don't need to re-infer that.
Not sure how important that is for the internal ones though, as the if/else statements in make_block will not give that much of overhead (I added the concat_same_type machinery to be able to do that in an external Block, but that does not mean that I necessarily need to do it internally).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's just added code and more complexity. Trying to reduce code here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I removed this if/else check in the categorical and datetimetz block's concat_same_type.

But if you want, I can also completely remove all overwriting of Block.concat_same_type and use a generic one in the base Block type (using _concat._concat_compat and make_block instead of np.concatenate and self.make_block_same_class). And in this way only keep the ability of external block types (like geopandas) to overwrite this, and without using that ability internally. But, doing that would remove the performance improvement I mentioned above (which is not the main objective of this PR, so not too bad, but having it is still nice).

return self.make_block_same_class(
values, placement=slice(0, len(values), 1))
else:
return make_block(values, placement=slice(0, len(values), 1))


class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
Expand Down Expand Up @@ -2684,6 +2705,19 @@ def shift(self, periods, axis=0, mgr=None):
return [self.make_block_same_class(new_values,
placement=self.mgr_locs)]

def concat_same_type(self, to_concat):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should prob make a dispatch function in _concat to avoid all of this boilerplate

def _concat_for_type(to_concat, typ='sparse|datetime|cat')

"""
Concatenate list of single blocks of the same type.
"""
to_concat = [blk.values for blk in to_concat]
values = _concat._concat_datetime(to_concat)

if is_datetimetz(values):
return self.make_block_same_class(
values, placement=slice(0, len(values), 1))
else:
return make_block(values, placement=slice(0, len(values), 1))


class SparseBlock(NonConsolidatableMixIn, Block):
""" implement as a list of sparse arrays of the same dtype """
Expand Down Expand Up @@ -2851,6 +2885,16 @@ def sparse_reindex(self, new_index):
return self.make_block_same_class(values, sparse_index=new_index,
placement=self.mgr_locs)

def concat_same_type(self, to_concat):
    """
    Concatenate list of single blocks of the same type.
    """
    # Pull the sparse arrays out of the blocks and delegate the actual
    # concatenation to the sparse-aware helper, then rewrap as a block.
    arrays = [b.values for b in to_concat]
    concatenated = _concat._concat_sparse(arrays)
    new_placement = slice(0, len(concatenated), 1)
    return self.make_block_same_class(concatenated,
                                      placement=new_placement)


def make_block(values, placement, klass=None, ndim=None, dtype=None,
fastpath=False):
Expand Down
21 changes: 18 additions & 3 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
"""

import numpy as np
from pandas import compat, DataFrame, Series, Index, MultiIndex
from pandas import compat, DataFrame, Series, Index, MultiIndex, SparseSeries
from pandas.core.index import (_get_objs_combined_axis,
_ensure_index, _get_consensus_names,
_all_indexes_same)
from pandas.core.categorical import (_factorize_from_iterable,
_factorize_from_iterables)
from pandas.core.internals import concatenate_block_managers
from pandas.core.internals import concatenate_block_managers, SparseBlock
from pandas.core import common as com
from pandas.core.generic import NDFrame
import pandas.core.dtypes.concat as _concat
Expand Down Expand Up @@ -362,15 +362,30 @@ def get_result(self):

# stack blocks
if self.axis == 0:
name = com._consensus_name_attr(self.objs)

        # concat Series with length to keep dtype as much as possible
non_empties = [x for x in self.objs if len(x) > 0]

# check if all series are of the same block type:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the purpose of concat_compat, why are you re-writing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

concat_compat works on the level of values, here I want to work on the level of blocks. I could also try to do that in concat_compat, but I am not sure I can fully switch that to using blocks (eg for concatting index it still needs to work on values)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah we just pushed the sparse handling for this down into the blocks to make .unstack cleaner. Its not that hard actually. you would just have a BlockManager.concat which does the work for you (e.g. give it a list of other block managers).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right, see my comment. This would be ideal to actually do inside the BlockManager (not objecting to the actual code, more to its location).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For series this would be SingleBlockManager then?

For me that would be OK to move this there, but I in the SingleBlockManager.concat, I would still need to dispatch to the actual Blocks (as I now have the methods on the individual blocks), as my original goal is enabling external blocks to override how their values are concatenated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure that's fine, your code would be pretty similar. the idea is you just have a Block.concat that works for most cases and can be overridden as needed (e.g. in your custom block).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, the sparse unstack PR is indeed a good example for this

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I did an attempt to move the logic inside SingleBlockManager, see last commit. Not fully sure I find it cleaner (as eg the new_axis is passed into the function, as that logic is currently quite ingrained into the _Concatenator class to calculate the new axis values)

if len(non_empties) > 0:
blocks = [obj._data.blocks[0] for obj in non_empties]
if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa
new_block = blocks[0].concat_same_type(blocks)
if isinstance(new_block, SparseBlock):
cons = SparseSeries
else:
cons = Series
return (cons(new_block, index=self.new_axes[0],
name=name, fastpath=True)
.__finalize__(self, method='concat'))

if len(non_empties) > 0:
values = [x._values for x in non_empties]
else:
values = [x._values for x in self.objs]
new_data = _concat._concat_compat(values)

name = com._consensus_name_attr(self.objs)
cons = _concat._get_series_result_type(new_data)

return (cons(new_data, index=self.new_axes[0],
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/internals/test_external_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,12 @@ def test_custom_repr():
blk_mgr = BlockManager([block], [['col'], range(3)])
df = pd.DataFrame(blk_mgr)
assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2'


def test_concat_series():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add the issue number here

values = np.arange(3, dtype='int64')
block = CustomBlock(values, placement=slice(0, 3))
s = pd.Series(block, pd.RangeIndex(3), fastpath=True)

res = pd.concat([s, s])
assert isinstance(res._data.blocks[0], CustomBlock)