From 5d156ef13c2d273937a97e044ddad08db1b56e96 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 13 Mar 2021 10:52:38 -0800 Subject: [PATCH 1/3] REF: use public pandas API in dataframe.empty --- fastparquet/dataframe.py | 65 ++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 42 deletions(-) diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py index 009ccb67..4cb95c03 100644 --- a/fastparquet/dataframe.py +++ b/fastparquet/dataframe.py @@ -2,7 +2,6 @@ from collections import OrderedDict from distutils.version import LooseVersion import numpy as np -from pandas.core.internals import BlockManager from pandas import ( Categorical, DataFrame, Series, CategoricalIndex, RangeIndex, Index, MultiIndex, @@ -88,14 +87,15 @@ def cat(col): df = OrderedDict() for t, col in zip(types, cols): + # Create empty arrays of length 1, so we can call `repeat` below if str(t) == 'category': - df[str(col)] = Categorical([], categories=cat(col), + df[str(col)] = Categorical([-1], categories=cat(col), fastpath=True) else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base - d = np.empty(0, dtype=t) + d = np.empty(1, dtype=t) if d.dtype.kind == "M" and str(col) in timezones: try: d = Series(d).dt.tz_localize(timezones[str(col)]) @@ -161,48 +161,29 @@ def set_cats(values, i=i, col=col, **kwargs): views[col] = d views[col+'-catdef'] = x - axes = [df._data.axes[0], index] - - # allocate and create blocks - blocks = [] - for block in df._data.blocks: - if block.is_categorical: - categories = block.values.categories - code = np.zeros(shape=size, dtype=block.values.codes.dtype) - values = Categorical(values=code, categories=categories, - fastpath=True) - new_block = block.make_block_same_class(values=values) - elif getattr(block.dtype, 'tz', None): - new_shape = (size, ) - values = np.empty(shape=new_shape, dtype='M8[ns]') - new_block = block.make_block_same_class( - type(block.values)(values, dtype=block.values.dtype) - ) - else: - new_shape = (block.values.shape[0], size) - values = np.empty(shape=new_shape, dtype=block.values.dtype) - new_block = block.make_block_same_class(values=values) + # Create DataFrame with same dtypes and desired length. + df = DataFrame( + {col: df[col]._values.repeat(size) for col in df.columns}, + index=index, + columns=df.columns, + ) - blocks.append(new_block) + # create views + for col in df.columns: + vals = df[col]._values + if isinstance(vals, np.ndarray): + views[col] = vals + elif is_categorical_dtype(vals): + views[col] = vals._codes + views[col+'-catdef'] = vals - # create block manager - df = DataFrame(BlockManager(blocks, axes)) + elif hasattr(vals.dtype, "tz"): + # datetime64tz, get the ndarray directly backing it + views[col] = vals._data - # create views - for block in df._data.blocks: - dtype = block.dtype - inds = block.mgr_locs.indexer - if isinstance(inds, slice): - inds = list(range(inds.start, inds.stop, inds.step)) - for i, ind in enumerate(inds): - col = df.columns[ind] - if is_categorical_dtype(dtype): - views[col] = block.values._codes - views[col+'-catdef'] = block.values - elif getattr(block.dtype, 'tz', None): - views[col] = np.asarray(block.values, dtype='M8[ns]') - else: - views[col] = block.values[i] + else: + # catchall, anything that gets here will be an ExtensionArray + views[col] = vals if index_names: df.index.names = [ From 7dc397f3c671756e88d4e9334cf2294d1d4d2db9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 13 Mar 2021 13:01:57 -0800 Subject: [PATCH 2/3] Address failing tests --- fastparquet/dataframe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py index 4cb95c03..2ddc24af 100644 --- a/fastparquet/dataframe.py +++ b/fastparquet/dataframe.py @@ -89,12 +89,15 @@ def cat(col): for t, col in zip(types, cols): # Create empty arrays of length 1, so we can call `repeat` below if str(t) == 'category': - df[str(col)] = Categorical([-1], categories=cat(col), + categories = cat(col) + code = 0 if len(categories) else -1 + df[str(col)] = Categorical([code], categories=categories, fastpath=True) else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base + t = {"M8": "M8[ns]", "m8": "m8[ns]"}.get(t, t) d = np.empty(1, dtype=t) if d.dtype.kind == "M" and str(col) in timezones: try: From 55abb8701b568ef7d86ca116e673d166748349ad Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 Mar 2021 12:07:28 -0700 Subject: [PATCH 3/3] REF/PERF: faster implementation, equally kludgy, somewhat more future-proof --- fastparquet/dataframe.py | 68 +++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py index 2ddc24af..e8be6d44 100644 --- a/fastparquet/dataframe.py +++ b/fastparquet/dataframe.py @@ -2,6 +2,7 @@ from collections import OrderedDict from distutils.version import LooseVersion import numpy as np +from pandas.core.internals import BlockManager from pandas import ( Categorical, DataFrame, Series, CategoricalIndex, RangeIndex, Index, MultiIndex, @@ -87,18 +88,14 @@ def cat(col): df = OrderedDict() for t, col in zip(types, cols): - # Create empty arrays of length 1, so we can call `repeat` below if str(t) == 'category': - categories = cat(col) - code = 0 if len(categories) else -1 - df[str(col)] = Categorical([code], categories=categories, + df[str(col)] = Categorical([], categories=cat(col), fastpath=True) else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base - t = {"M8": "M8[ns]", "m8": "m8[ns]"}.get(t, t) - d = np.empty(1, dtype=t) + d = np.empty(0, dtype=t) if d.dtype.kind == "M" and str(col) in timezones: try: d = Series(d).dt.tz_localize(timezones[str(col)]) @@ -164,29 +161,50 @@ def set_cats(values, i=i, col=col, **kwargs): views[col] = d views[col+'-catdef'] = x - # Create DataFrame with same dtypes and desired length. - df = DataFrame( - {col: df[col]._values.repeat(size) for col in df.columns}, - index=index, - columns=df.columns, - ) + axes = [df._data.axes[0], index] - # create views - for col in df.columns: - vals = df[col]._values - if isinstance(vals, np.ndarray): - views[col] = vals - elif is_categorical_dtype(vals): - views[col] = vals._codes - views[col+'-catdef'] = vals + # Patch our blocks with desired-length arrays. Kids: don't try this at home. + mgr = df._data + for block in mgr.blocks: + bvalues = block.values + shape = list(bvalues.shape) + shape[-1] = size + + if isinstance(bvalues, Categorical): + categories = bvalues.categories + code = np.zeros(shape=shape, dtype=bvalues.codes.dtype) - elif hasattr(vals.dtype, "tz"): - # datetime64tz, get the ndarray directly backing it - views[col] = vals._data + values = Categorical(values=code, dtype=bvalues.dtype, + fastpath=True) + elif getattr(bvalues.dtype, 'tz', None): + values = np.empty(shape=shape, dtype='M8[ns]') + values = type(bvalues)(values, dtype=bvalues.dtype) else: - # catchall, anything that gets here will be an ExtensionArray - views[col] = vals + # Note: this will break on any ExtensionDtype other than + # Categorical and DatetimeTZ + values = np.empty(shape=shape, dtype=bvalues.dtype) + + block.values = values + + mgr.axes[-1] = index + + # create block manager + # create views + for block in df._data.blocks: + dtype = block.dtype + inds = block.mgr_locs.indexer + if isinstance(inds, slice): + inds = list(range(inds.start, inds.stop, inds.step)) + for i, ind in enumerate(inds): + col = df.columns[ind] + if is_categorical_dtype(dtype): + views[col] = block.values._codes + views[col+'-catdef'] = block.values + elif getattr(block.dtype, 'tz', None): + views[col] = np.asarray(block.values, dtype='M8[ns]') + else: + views[col] = block.values[i] if index_names: df.index.names = [