-
-
Notifications
You must be signed in to change notification settings - Fork 18k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REF/INT: concat blocks of same type with preserving block type #17728
Changes from 11 commits
007efb1
dd9babd
8c6f4a7
d4ce3df
7676f03
1c35aca
bd561d9
0c90561
1603f7d
d3a79e7
315f16b
66b5846
bb5a100
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -314,6 +314,15 @@ def ftype(self): | |
def merge(self, other): | ||
return _merge_blocks([self, other]) | ||
|
||
def concat_same_type(self, to_concat, placement=None): | ||
""" | ||
Concatenate list of single blocks of the same type. | ||
""" | ||
values = np.concatenate([blk.values for blk in to_concat], | ||
axis=self.ndim - 1) | ||
return self.make_block_same_class( | ||
values, placement=placement or slice(0, len(values), 1)) | ||
|
||
def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, | ||
limit=None, mask_info=None): | ||
""" | ||
|
@@ -2432,6 +2441,18 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): | |
# we are expected to return a 2-d ndarray | ||
return values.reshape(1, len(values)) | ||
|
||
def concat_same_type(self, to_concat, placement=None): | ||
""" | ||
Concatenate list of single blocks of the same type. | ||
""" | ||
to_concat = [blk.values for blk in to_concat] | ||
values = _concat._concat_categorical(to_concat, axis=self.ndim - 1) | ||
|
||
# not using self.make_block_same_class as values can be object dtype | ||
return make_block( | ||
values, placement=placement or slice(0, len(values), 1), | ||
ndim=self.ndim) | ||
|
||
|
||
class DatetimeBlock(DatetimeLikeBlockMixin, Block): | ||
__slots__ = () | ||
|
@@ -2711,6 +2732,17 @@ def shift(self, periods, axis=0, mgr=None): | |
return [self.make_block_same_class(new_values, | ||
placement=self.mgr_locs)] | ||
|
||
def concat_same_type(self, to_concat, placement=None): | ||
""" | ||
Concatenate list of single blocks of the same type. | ||
""" | ||
to_concat = [blk.values for blk in to_concat] | ||
values = _concat._concat_datetime(to_concat, axis=self.ndim - 1) | ||
|
||
# not using self.make_block_same_class as values can be non-tz dtype | ||
return make_block( | ||
values, placement=placement or slice(0, len(values), 1)) | ||
|
||
|
||
class SparseBlock(NonConsolidatableMixIn, Block): | ||
""" implement as a list of sparse arrays of the same dtype """ | ||
|
@@ -2878,6 +2910,16 @@ def sparse_reindex(self, new_index): | |
return self.make_block_same_class(values, sparse_index=new_index, | ||
placement=self.mgr_locs) | ||
|
||
def concat_same_type(self, to_concat, placement=None): | ||
""" | ||
Concatenate list of single blocks of the same type. | ||
""" | ||
to_concat = [blk.values for blk in to_concat] | ||
values = _concat._concat_sparse(to_concat) | ||
|
||
return self.make_block_same_class( | ||
values, placement=placement or slice(0, len(values), 1)) | ||
|
||
|
||
def make_block(values, placement, klass=None, ndim=None, dtype=None, | ||
fastpath=False): | ||
|
@@ -4517,6 +4559,45 @@ def fast_xs(self, loc): | |
""" | ||
return self._block.values[loc] | ||
|
||
def concat(self, to_concat, new_axis): | ||
""" | ||
Concatenate a list of SingleBlockManagers into a single | ||
SingleBlockManager. | ||
|
||
Used for pd.concat of Series objects with axis=0. | ||
|
||
Parameters | ||
---------- | ||
to_concat : list of SingleBlockManagers | ||
new_axis : Index of the result | ||
|
||
Returns | ||
------- | ||
SingleBlockManager | ||
|
||
""" | ||
non_empties = [x for x in to_concat if len(x) > 0] | ||
|
||
# check if all series are of the same block type: | ||
if len(non_empties) > 0: | ||
blocks = [obj.blocks[0] for obj in non_empties] | ||
|
||
if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could some of this logic be moreve to concaty_same_type? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which logic do you mean exactly? |
||
new_block = blocks[0].concat_same_type(blocks) | ||
else: | ||
values = [x.values for x in blocks] | ||
values = _concat._concat_compat(values) | ||
new_block = make_block( | ||
values, placement=slice(0, len(values), 1)) | ||
else: | ||
values = [x._block.values for x in to_concat] | ||
values = _concat._concat_compat(values) | ||
new_block = make_block( | ||
values, placement=slice(0, len(values), 1)) | ||
|
||
mgr = SingleBlockManager(new_block, new_axis) | ||
return mgr | ||
|
||
|
||
def construction_error(tot_items, block_shape, axes, e=None): | ||
""" raise a helpful message about our construction """ | ||
|
@@ -5105,13 +5186,42 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): | |
[get_mgr_concatenation_plan(mgr, indexers) | ||
for mgr, indexers in mgrs_indexers], concat_axis) | ||
|
||
blocks = [make_block( | ||
concatenate_join_units(join_units, concat_axis, copy=copy), | ||
placement=placement) for placement, join_units in concat_plan] | ||
blocks = [] | ||
|
||
for placement, join_units in concat_plan: | ||
|
||
if is_uniform_join_units(join_units): | ||
b = join_units[0].block.concat_same_type( | ||
[ju.block for ju in join_units], placement=placement) | ||
else: | ||
b = make_block( | ||
concatenate_join_units(join_units, concat_axis, copy=copy), | ||
placement=placement) | ||
blocks.append(b) | ||
|
||
return BlockManager(blocks, axes) | ||
|
||
|
||
def is_uniform_join_units(join_units): | ||
""" | ||
Check if the join units consist of blocks of uniform type that can | ||
be concatenated using Block.concat_same_type instead of the generic | ||
concatenate_join_units (which uses `_concat._concat_compat`). | ||
|
||
""" | ||
return ( | ||
# all blocks need to have the same type | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a mouthful! can you put blank lines in between statements |
||
all([type(ju.block) is type(join_units[0].block) for ju in join_units]) and # noqa | ||
# no blocks that would get missing values (can lead to type upcasts) | ||
all([not ju.is_na for ju in join_units]) and | ||
# no blocks with indexers (as then the dimensions do not fit) | ||
all([not ju.indexers for ju in join_units]) and | ||
# disregard Panels | ||
all([ju.block.ndim <= 2 for ju in join_units]) and | ||
# only use this path when there is something to concatenate | ||
len(join_units) > 1) | ||
|
||
|
||
def get_empty_dtype_and_na(join_units): | ||
""" | ||
Return dtype and N/A values to use when concatenating specified units. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,14 +4,26 @@ | |
import numpy as np | ||
|
||
import pandas as pd | ||
from pandas.core.internals import Block, BlockManager, SingleBlockManager | ||
from pandas.core.internals import ( | ||
Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn) | ||
|
||
|
||
class CustomBlock(Block): | ||
class CustomBlock(NonConsolidatableMixIn, Block): | ||
|
||
_holder = np.ndarray | ||
|
||
def formatting_values(self): | ||
return np.array(["Val: {}".format(i) for i in self.values]) | ||
|
||
def concat_same_type(self, to_concat, placement=None): | ||
""" | ||
Always concatenate disregarding self.ndim as the values are | ||
always 1D in this custom Block | ||
""" | ||
values = np.concatenate([blk.values for blk in to_concat]) | ||
return self.make_block_same_class( | ||
values, placement=placement or slice(0, len(values), 1)) | ||
|
||
|
||
def test_custom_repr(): | ||
values = np.arange(3, dtype='int64') | ||
|
@@ -23,7 +35,30 @@ def test_custom_repr(): | |
assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64' | ||
|
||
# dataframe | ||
block = CustomBlock(values.reshape(1, -1), placement=slice(0, 1)) | ||
block = CustomBlock(values, placement=slice(0, 1)) | ||
blk_mgr = BlockManager([block], [['col'], range(3)]) | ||
df = pd.DataFrame(blk_mgr) | ||
assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2' | ||
|
||
|
||
def test_concat_series(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add the issue number here |
||
# GH17728 | ||
values = np.arange(3, dtype='int64') | ||
block = CustomBlock(values, placement=slice(0, 3)) | ||
s = pd.Series(block, pd.RangeIndex(3), fastpath=True) | ||
|
||
res = pd.concat([s, s]) | ||
assert isinstance(res._data.blocks[0], CustomBlock) | ||
|
||
|
||
def test_concat_dataframe(): | ||
# GH17728 | ||
df = pd.DataFrame({'a': [1, 2, 3]}) | ||
blocks = df._data.blocks | ||
values = np.arange(3, dtype='int64') | ||
custom_block = CustomBlock(values, placement=slice(1, 2)) | ||
blocks = blocks + (custom_block, ) | ||
block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df.index]) | ||
df = pd.DataFrame(block_manager) | ||
res = pd.concat([df, df]) | ||
assert isinstance(res._data.blocks[1], CustomBlock) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
actually you could combine all of these
concat_same_type
into a single routine if you did thisThen add to Block
Categorical
etc
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, that would actually be nice. The only problem with this is that (for the reasons that I had the if/else statements originally)
self.make_block_same_class
will not always work. Eg_concat_categorical
can return both categorical values as object values, and depending on that should return a CategoricalBlock or another type of Block.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
so for categorical block override; otherwise u end up repeating lots of code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
but need to overwrite for Datetimetz as well, so then end up with almost as many overridden ones as now (only for Sparse it would not be needed then).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is still repeatting way too much code. you can just do it this way
in Block
then where needed
that way for an overriden class you are not repeating evertyhing.