Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF/INT: concat blocks of same type with preserving block type #17728

Merged
merged 13 commits into from
Oct 12, 2017
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ def get_dtype_kinds(l):
return typs


def _get_series_result_type(result):
def _get_series_result_type(result, objs=None):
"""
return appropriate class of Series concat
input is either dict or array-like
"""
# concat Series with axis 1
if isinstance(result, dict):
# concat Series with axis 1
if all(is_sparse(c) for c in compat.itervalues(result)):
Expand All @@ -77,13 +78,12 @@ def _get_series_result_type(result):
from pandas.core.frame import DataFrame
return DataFrame

elif is_sparse(result):
# concat Series with axis 1
# otherwise it is a SingleBlockManager (axis = 0)
if result._block.is_sparse:
from pandas.core.sparse.api import SparseSeries
return SparseSeries
else:
from pandas.core.series import Series
return Series
return objs[0]._constructor


def _get_frame_result_type(result, objs):
Expand Down
122 changes: 119 additions & 3 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,15 @@ def ftype(self):
def merge(self, other):
return _merge_blocks([self, other])

def concat_same_type(self, to_concat, placement=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually you could combine all of these concat_same_type into a single routine if you did this

def concat_same_type(self, to_concat, placement=None)
       """
       Concatenate list of single blocks of the same type.
       """
        values = self._concatenator([blk.values for blk in to_concat], axis=self.ndim - 1)
        return self.make_block_same_class(
            values, placement=placement or slice(0, len(values), 1))

Then add to Block

_concatenator = np.concatenate

Categorical

_concatenator = _concat._concat_categorical

etc

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that would actually be nice. The only problem with this is that (for the reasons that I had the if/else statements originally) self.make_block_same_class will not always work. Eg _concat_categorical can return both categorical values as object values, and depending on that should return a CategoricalBlock or another type of Block.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so for categorical block override; otherwise u end up repeating lots of code

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but need to overwrite for Datetimetz as well, so then end up with almost as many overridden ones as now (only for Sparse it would not be needed then).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is still repeating way too much code. you can just do it this way

in Block

def concat_same_type(self, to_concat, constructor=None, placement=None):
     values = self._concatenator(......)
    if constructor is None:
        constructor = make_block
      return constructor(....)

then where needed

def concat_same_type(.......):
      return super(Categorical, self).concat_same_type(....., constructor=self.make_block_same_class)

that way for an overridden class you are not repeating everything.

"""
Concatenate list of single blocks of the same type.
"""
values = np.concatenate([blk.values for blk in to_concat],
axis=self.ndim - 1)
return self.make_block_same_class(
values, placement=placement or slice(0, len(values), 1))

def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
limit=None, mask_info=None):
"""
Expand Down Expand Up @@ -2432,6 +2441,21 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
# we are expected to return a 2-d ndarray
return values.reshape(1, len(values))

def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
"""
to_concat = [blk.values for blk in to_concat]
values = _concat._concat_categorical(to_concat, axis=self.ndim - 1)

if is_categorical_dtype(values.dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you could dispense with all of these if/thens and just use make_block

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I could, but I wanted to make advantage of the fact that for those case it is categorical, I don't need to re-infer that.
Not sure how important that is for the internal ones though, as the if/else statements in make_block will not give that much of overhead (I added the concat_same_type machinery to be able to do that in an external Block, but that does not mean that I necessarily need to do it internally).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's just added code and more complexity. Trying to reduce code here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I removed this if/else check in the categorical and datetimetz block's concat_same_type.

But if you want, I can also completely remove all overwriting of Block.concat_same_type and use a generic one in the base Block type (using _concat._concat_compat and make_block instead of np.concatenate and self.make_block_same_class). And in this way only keep the ability of external block types (like geopandas) to overwrite this, and without using that ability internally. But, doing that would remove the performance improvement I mentioned above (which is not the main objective of this PR, so not too bad, but having it is still nice).

return self.make_block_same_class(
values, placement=placement or slice(0, len(values), 1))
else:
return make_block(
values, placement=placement or slice(0, len(values), 1),
ndim=self.ndim)


class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
Expand Down Expand Up @@ -2711,6 +2735,20 @@ def shift(self, periods, axis=0, mgr=None):
return [self.make_block_same_class(new_values,
placement=self.mgr_locs)]

def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate list of single blocks of the same type.

    If the concatenated values are still tz-aware, a block of the same
    class is returned; otherwise the result is dispatched through the
    generic ``make_block`` factory.
    """
    concatenated = _concat._concat_datetime(
        [blk.values for blk in to_concat], axis=self.ndim - 1)

    # default placement covers the full length of the concatenated values
    new_placement = placement or slice(0, len(concatenated), 1)

    if is_datetimetz(concatenated):
        return self.make_block_same_class(concatenated,
                                          placement=new_placement)
    return make_block(concatenated, placement=new_placement)


class SparseBlock(NonConsolidatableMixIn, Block):
""" implement as a list of sparse arrays of the same dtype """
Expand Down Expand Up @@ -2878,6 +2916,16 @@ def sparse_reindex(self, new_index):
return self.make_block_same_class(values, sparse_index=new_index,
placement=self.mgr_locs)

def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate list of single blocks of the same type.

    Sparse blocks always concatenate into another sparse block of the
    same class.
    """
    concatenated = _concat._concat_sparse(
        [blk.values for blk in to_concat])
    return self.make_block_same_class(
        concatenated,
        placement=placement or slice(0, len(concatenated), 1))


def make_block(values, placement, klass=None, ndim=None, dtype=None,
fastpath=False):
Expand Down Expand Up @@ -4517,6 +4565,45 @@ def fast_xs(self, loc):
"""
return self._block.values[loc]

def concat(self, to_concat, new_axis):
"""
Concatenate a list of SingleBlockManagers into a single
SingleBlockManager.

Used for pd.concat of Series objects with axis=0.

Parameters
----------
to_concat : list of SingleBlockManagers
new_axis : Index of the result

Returns
-------
SingleBlockManager

"""
non_empties = [x for x in to_concat if len(x) > 0]

# check if all series are of the same block type:
if len(non_empties) > 0:
blocks = [obj.blocks[0] for obj in non_empties]

if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could some of this logic be moved to concat_same_type?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which logic do you mean exactly?
For now, I coded concat_same_type such that it assumes only blocks of the same type are passed (so I don't do the checking there, but before the method is called)

new_block = blocks[0].concat_same_type(blocks)
else:
values = [x.values for x in blocks]
values = _concat._concat_compat(values)
new_block = make_block(
values, placement=slice(0, len(values), 1))
else:
values = [x._block.values for x in to_concat]
values = _concat._concat_compat(values)
new_block = make_block(
values, placement=slice(0, len(values), 1))

mgr = SingleBlockManager(new_block, new_axis)
return mgr


def construction_error(tot_items, block_shape, axes, e=None):
""" raise a helpful message about our construction """
Expand Down Expand Up @@ -5105,13 +5192,42 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
[get_mgr_concatenation_plan(mgr, indexers)
for mgr, indexers in mgrs_indexers], concat_axis)

blocks = [make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement) for placement, join_units in concat_plan]
blocks = []

for placement, join_units in concat_plan:

if is_uniform_join_units(join_units):
b = join_units[0].block.concat_same_type(
[ju.block for ju in join_units], placement=placement)
else:
b = make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement)
blocks.append(b)

return BlockManager(blocks, axes)


def is_uniform_join_units(join_units):
"""
Check if the join units consist of blocks of uniform type that can
be concatenated using Block.concat_same_type instead of the generic
concatenate_join_units (which uses `_concat._concat_compat`).

"""
return (
# all blocks need to have the same type
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a mouthful!

can you put blank lines in between statements

all([type(ju.block) is type(join_units[0].block) for ju in join_units]) and # noqa
# no blocks that would get missing values (can lead to type upcasts)
all([not ju.is_na for ju in join_units]) and
# no blocks with indexers (as then the dimensions do not fit)
all([not ju.indexers for ju in join_units]) and
# disregard Panels
all([ju.block.ndim <= 2 for ju in join_units]) and
# only use this path when there is something to concatenate
len(join_units) > 1)


def get_empty_dtype_and_na(join_units):
"""
Return dtype and N/A values to use when concatenating specified units.
Expand Down
16 changes: 4 additions & 12 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,20 +362,12 @@ def get_result(self):

# stack blocks
if self.axis == 0:
# concat Series with length to keep dtype as much
non_empties = [x for x in self.objs if len(x) > 0]
if len(non_empties) > 0:
values = [x._values for x in non_empties]
else:
values = [x._values for x in self.objs]
new_data = _concat._concat_compat(values)

name = com._consensus_name_attr(self.objs)
cons = _concat._get_series_result_type(new_data)

return (cons(new_data, index=self.new_axes[0],
name=name, dtype=new_data.dtype)
.__finalize__(self, method='concat'))
mgr = self.objs[0]._data.concat([x._data for x in self.objs],
self.new_axes)
cons = _concat._get_series_result_type(mgr, self.objs)
return cons(mgr, name=name).__finalize__(self, method='concat')

# combine as columns in a frame
else:
Expand Down
39 changes: 36 additions & 3 deletions pandas/tests/internals/test_external_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,26 @@
import numpy as np

import pandas as pd
from pandas.core.internals import Block, BlockManager, SingleBlockManager
from pandas.core.internals import (
Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn)


class CustomBlock(Block):
class CustomBlock(NonConsolidatableMixIn, Block):

_holder = np.ndarray

def formatting_values(self):
return np.array(["Val: {}".format(i) for i in self.values])

def concat_same_type(self, to_concat, placement=None):
"""
Always concatenate disregarding self.ndim as the values are
always 1D in this custom Block
"""
values = np.concatenate([blk.values for blk in to_concat])
return self.make_block_same_class(
values, placement=placement or slice(0, len(values), 1))


def test_custom_repr():
values = np.arange(3, dtype='int64')
Expand All @@ -23,7 +35,28 @@ def test_custom_repr():
assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64'

# dataframe
block = CustomBlock(values.reshape(1, -1), placement=slice(0, 1))
block = CustomBlock(values, placement=slice(0, 1))
blk_mgr = BlockManager([block], [['col'], range(3)])
df = pd.DataFrame(blk_mgr)
assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2'


def test_concat_series():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add the issue number here

values = np.arange(3, dtype='int64')
block = CustomBlock(values, placement=slice(0, 3))
s = pd.Series(block, pd.RangeIndex(3), fastpath=True)

res = pd.concat([s, s])
assert isinstance(res._data.blocks[0], CustomBlock)


def test_concat_dataframe():
    # GH17728: concat should preserve a custom block type in the result
    df = pd.DataFrame({'a': [1, 2, 3]})
    custom_block = CustomBlock(np.arange(3, dtype='int64'),
                               placement=slice(1, 2))
    blocks = df._data.blocks + (custom_block, )
    block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df.index])
    df = pd.DataFrame(block_manager)

    res = pd.concat([df, df])
    assert isinstance(res._data.blocks[1], CustomBlock)