# REF/INT: concat blocks of same type with preserving block type (#17728)
`pandas/core/internals.py`:

```diff
@@ -312,6 +312,14 @@ def ftype(self):
     def merge(self, other):
         return _merge_blocks([self, other])

+    def concat_same_type(self, to_concat):
+        """
+        Concatenate list of single blocks of the same type.
+        """
+        values = np.concatenate([blk.values for blk in to_concat])
+        return self.make_block_same_class(
+            values, placement=slice(0, len(values), 1))
+
     def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
                      limit=None, mask_info=None):
         """
```
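For readers unfamiliar with the internals, here is a rough usage sketch of this new base-class hook. This is 0.21-era private API (`make_block` and the block classes are internal, so details may differ), but it shows the contract: concatenate the underlying values and rebuild a block of the same class with a fresh placement.

```python
import numpy as np
from pandas.core.internals import make_block

# Two 1-d integer blocks, as they would back two Series.
left = make_block(np.arange(3), placement=slice(0, 3))
right = make_block(np.arange(3, 6), placement=slice(0, 3))

# The default hook concatenates the values and returns a block of the
# same class covering the combined length.
joined = left.concat_same_type([left, right])
assert type(joined) is type(left)
assert list(joined.values) == [0, 1, 2, 3, 4, 5]
```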
```diff
@@ -2407,6 +2415,19 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
         # we are expected to return a 2-d ndarray
         return values.reshape(1, len(values))

+    def concat_same_type(self, to_concat):
+        """
+        Concatenate list of single blocks of the same type.
+        """
+        to_concat = [blk.values for blk in to_concat]
+        values = _concat._concat_categorical(to_concat)
+
+        if is_categorical_dtype(values.dtype):
+            return self.make_block_same_class(
+                values, placement=slice(0, len(values), 1))
+        else:
+            return make_block(values, placement=slice(0, len(values), 1))
+
+
 class DatetimeBlock(DatetimeLikeBlockMixin, Block):
     __slots__ = ()
```
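The `is_categorical_dtype` check matters because concatenating categoricals does not always stay categorical: with differing categories the result falls back to object and can no longer live in a categorical block. A quick illustration of that behavior at the Series level:

```python
import pandas as pd

same = pd.Series(pd.Categorical(['a', 'b']))
other = pd.Series(pd.Categorical(['x', 'y']))

# Identical categories: the result keeps the categorical dtype.
print(pd.concat([same, same]).dtype)   # category
# Differing categories: the concatenation falls back to object.
print(pd.concat([same, other]).dtype)  # object
```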
```diff
@@ -2684,6 +2705,19 @@ def shift(self, periods, axis=0, mgr=None):
         return [self.make_block_same_class(new_values,
                                            placement=self.mgr_locs)]

+    def concat_same_type(self, to_concat):
+        """
+        Concatenate list of single blocks of the same type.
+        """
+        to_concat = [blk.values for blk in to_concat]
+        values = _concat._concat_datetime(to_concat)
+
+        if is_datetimetz(values):
+            return self.make_block_same_class(
+                values, placement=slice(0, len(values), 1))
+        else:
+            return make_block(values, placement=slice(0, len(values), 1))
+
+
 class SparseBlock(NonConsolidatableMixIn, Block):
     """ implement as a list of sparse arrays of the same dtype """
```

Inline review comment on `concat_same_type`:

> we should prob make a dispatch function in […]
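As with the categorical case, the tz check exists because `_concat_datetime` only preserves the tz-aware dtype when all inputs share the same timezone; mixing timezones, or aware with naive, falls back to object. At the Series level:

```python
import pandas as pd

utc = pd.Series(pd.date_range('2017-01-01', periods=2, tz='UTC'))
naive = pd.Series(pd.date_range('2017-01-01', periods=2))

# Shared tz: the tz-aware dtype survives the concatenation.
print(pd.concat([utc, utc]).dtype)    # datetime64[ns, UTC]
# Aware mixed with naive: the result degrades to object.
print(pd.concat([utc, naive]).dtype)  # object
```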
```diff
@@ -2851,6 +2885,16 @@ def sparse_reindex(self, new_index):
         return self.make_block_same_class(values, sparse_index=new_index,
                                           placement=self.mgr_locs)

+    def concat_same_type(self, to_concat):
+        """
+        Concatenate list of single blocks of the same type.
+        """
+        to_concat = [blk.values for blk in to_concat]
+        values = _concat._concat_sparse(to_concat)
+
+        return self.make_block_same_class(
+            values, placement=slice(0, len(values), 1))
+
+
 def make_block(values, placement, klass=None, ndim=None, dtype=None,
                fastpath=False):
```
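The practical effect, together with the `concat.py` change below, is that concatenating sparse series can stay sparse instead of densifying first. A 0.21-era illustration (`SparseSeries` and `.to_sparse()` have since been removed from pandas, so this only runs on versions contemporary with this PR):

```python
import pandas as pd

s = pd.Series([0, 0, 1, 2]).to_sparse(fill_value=0)
res = pd.concat([s, s])

# With same-typed blocks dispatched to SparseBlock.concat_same_type,
# the result is a SparseSeries rather than a densified Series.
print(type(res).__name__)  # SparseSeries
```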
`pandas/core/reshape/concat.py`:
```diff
@@ -3,13 +3,13 @@
 """

 import numpy as np
-from pandas import compat, DataFrame, Series, Index, MultiIndex
+from pandas import compat, DataFrame, Series, Index, MultiIndex, SparseSeries
 from pandas.core.index import (_get_objs_combined_axis,
                                _ensure_index, _get_consensus_names,
                                _all_indexes_same)
 from pandas.core.categorical import (_factorize_from_iterable,
                                      _factorize_from_iterables)
-from pandas.core.internals import concatenate_block_managers
+from pandas.core.internals import concatenate_block_managers, SparseBlock
 from pandas.core import common as com
 from pandas.core.generic import NDFrame
 import pandas.core.dtypes.concat as _concat
```
```diff
@@ -362,15 +362,30 @@ def get_result(self):

         # stack blocks
         if self.axis == 0:
+            name = com._consensus_name_attr(self.objs)
+
             # concat Series with length to keep dtype as much
             non_empties = [x for x in self.objs if len(x) > 0]

+            # check if all series are of the same block type:
+            if len(non_empties) > 0:
+                blocks = [obj._data.blocks[0] for obj in non_empties]
+                if all([type(b) is type(blocks[0]) for b in blocks[1:]]):  # noqa
+                    new_block = blocks[0].concat_same_type(blocks)
+                    if isinstance(new_block, SparseBlock):
+                        cons = SparseSeries
+                    else:
+                        cons = Series
+                    return (cons(new_block, index=self.new_axes[0],
+                                 name=name, fastpath=True)
+                            .__finalize__(self, method='concat'))
+
             if len(non_empties) > 0:
                 values = [x._values for x in non_empties]
             else:
                 values = [x._values for x in self.objs]
             new_data = _concat._concat_compat(values)

-            name = com._consensus_name_attr(self.objs)
             cons = _concat._get_series_result_type(new_data)

             return (cons(new_data, index=self.new_axes[0],
```

Inline review thread on the block-type check:

- "this is the purpose of `concat_compat`, why are you re-writing?"
- "yeah, we just pushed the sparse handling for this down into the blocks to make […]"
- "right, see my comment. This would be ideal to actually do inside the BlockManager (not objecting to the actual code, more to its location)."
- "For Series this would be `SingleBlockManager` then? For me it would be OK to move this there, but in `SingleBlockManager.concat` I would still need to dispatch to the actual blocks (as I now have the methods on the individual blocks), since my original goal is enabling external blocks to override how their values are concatenated."
- "sure, that's fine, your code would be pretty similar. The idea is that you just have a `Block.concat` that works for most cases and can be overridden as needed (e.g. in your custom block)."
- "yeah, the sparse unstack PR is indeed a good example for this"
- "So I did an attempt to move the logic inside `SingleBlockManager`, see the last commit. Not fully sure I find it cleaner (as e.g. the `new_axis` is passed into the function, as that logic is currently quite ingrained into the […])"
`pandas/tests/internals/test_external_block.py`:
```diff
@@ -27,3 +27,12 @@ def test_custom_repr():
     blk_mgr = BlockManager([block], [['col'], range(3)])
     df = pd.DataFrame(blk_mgr)
     assert repr(df) == '      col\n0  Val: 0\n1  Val: 1\n2  Val: 2'
+
+
+def test_concat_series():
+    values = np.arange(3, dtype='int64')
+    block = CustomBlock(values, placement=slice(0, 3))
+    s = pd.Series(block, pd.RangeIndex(3), fastpath=True)
+
+    res = pd.concat([s, s])
+    assert isinstance(res._data.blocks[0], CustomBlock)
```

Inline review comment on `test_concat_series`:

> can you add the issue number here
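For context, the `CustomBlock` this test uses is the minimal external block defined earlier in the same test module. Roughly, it looks like the sketch below (exact details may differ from the module's actual definition): it subclasses `Block` and only customizes how values are rendered, which is what the `'Val: 0'` repr in the earlier test exercises.

```python
import numpy as np
from pandas.core.internals import Block

class CustomBlock(Block):
    """Minimal external block: custom formatting, no new behavior."""
    def formatting_values(self):
        return np.array(["Val: {}".format(i) for i in self.values],
                        dtype=object)
```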
Review thread on the categorical/datetimetz `concat_same_type` overrides:

- Reviewer: "I think you could dispense with all of these if/thens and just use `make_block`"
- Author: "Yep, I could, but I wanted to take advantage of the fact that for those cases it is categorical, so I don't need to re-infer that. Not sure how important that is for the internal ones though, as the if/else statements in `make_block` will not give that much overhead (I added the `concat_same_type` machinery to be able to do that in an external Block, but that does not mean that I necessarily need to do it internally)."
- Reviewer: "it's just added code and more complexity. Trying to reduce code here."
- Author: "So I removed this if/else check in the categorical and datetimetz blocks' `concat_same_type`. But if you want, I can also completely remove all overriding of `Block.concat_same_type` and use a generic one in the base Block type (using `_concat._concat_compat` and `make_block` instead of `np.concatenate` and `self.make_block_same_class`), and in this way only keep the ability of external block types (like geopandas) to override this, without using that ability internally. But doing that would remove the performance improvement I mentioned above (which is not the main objective of this PR, so not too bad, but having it is still nice)."