From 5d156ef13c2d273937a97e044ddad08db1b56e96 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 13 Mar 2021 10:52:38 -0800
Subject: [PATCH 1/3] REF: use public pandas API in dataframe.empty

---
 fastparquet/dataframe.py | 65 ++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
index 009ccb67..4cb95c03 100644
--- a/fastparquet/dataframe.py
+++ b/fastparquet/dataframe.py
@@ -2,7 +2,6 @@
 from collections import OrderedDict
 from distutils.version import LooseVersion
 import numpy as np
-from pandas.core.internals import BlockManager
 from pandas import (
     Categorical, DataFrame, Series,
     CategoricalIndex, RangeIndex, Index, MultiIndex,
@@ -88,14 +87,15 @@ def cat(col):
 
     df = OrderedDict()
     for t, col in zip(types, cols):
+        # Create empty arrays of length 1, so we can call `repeat` below
         if str(t) == 'category':
-            df[str(col)] = Categorical([], categories=cat(col),
+            df[str(col)] = Categorical([-1], categories=cat(col),
                                                  fastpath=True)
         else:
             if hasattr(t, 'base'):
                 # funky pandas not-dtype
                 t = t.base
-            d = np.empty(0, dtype=t)
+            d = np.empty(1, dtype=t)
             if d.dtype.kind == "M" and str(col) in timezones:
                 try:
                     d = Series(d).dt.tz_localize(timezones[str(col)])
@@ -161,48 +161,29 @@ def set_cats(values, i=i, col=col, **kwargs):
             views[col] = d
             views[col+'-catdef'] = x
 
-    axes = [df._data.axes[0], index]
-
-    # allocate and create blocks
-    blocks = []
-    for block in df._data.blocks:
-        if block.is_categorical:
-            categories = block.values.categories
-            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
-            values = Categorical(values=code, categories=categories,
-                                 fastpath=True)
-            new_block = block.make_block_same_class(values=values)
-        elif getattr(block.dtype, 'tz', None):
-            new_shape = (size, )
-            values = np.empty(shape=new_shape, dtype='M8[ns]')
-            new_block = block.make_block_same_class(
-                type(block.values)(values, dtype=block.values.dtype)
-            )
-        else:
-            new_shape = (block.values.shape[0], size)
-            values = np.empty(shape=new_shape, dtype=block.values.dtype)
-            new_block = block.make_block_same_class(values=values)
+    # Create DataFrame with same dtypes and desired length.
+    df = DataFrame(
+        {col: df[col]._values.repeat(size) for col in df.columns},
+        index=index,
+        columns=df.columns,
+    )
 
-        blocks.append(new_block)
+    # create views
+    for col in df.columns:
+        vals = df[col]._values
+        if isinstance(vals, np.ndarray):
+            views[col] = vals
+        elif is_categorical_dtype(vals):
+            views[col] = vals._codes
+            views[col+'-catdef'] = vals
 
-    # create block manager
-    df = DataFrame(BlockManager(blocks, axes))
+        elif hasattr(vals.dtype, "tz"):
+            # datetime64tz, get the ndarray directly backing it
+            views[col] = vals._data
 
-    # create views
-    for block in df._data.blocks:
-        dtype = block.dtype
-        inds = block.mgr_locs.indexer
-        if isinstance(inds, slice):
-            inds = list(range(inds.start, inds.stop, inds.step))
-        for i, ind in enumerate(inds):
-            col = df.columns[ind]
-            if is_categorical_dtype(dtype):
-                views[col] = block.values._codes
-                views[col+'-catdef'] = block.values
-            elif getattr(block.dtype, 'tz', None):
-                views[col] = np.asarray(block.values, dtype='M8[ns]')
-            else:
-                views[col] = block.values[i]
+        else:
+            # catchall, anything that gets here will be an ExtensionArray 
+            views[col] = vals
 
     if index_names:
         df.index.names = [

From 7dc397f3c671756e88d4e9334cf2294d1d4d2db9 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 13 Mar 2021 13:01:57 -0800
Subject: [PATCH 2/3] Address failing tests

---
 fastparquet/dataframe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
index 4cb95c03..2ddc24af 100644
--- a/fastparquet/dataframe.py
+++ b/fastparquet/dataframe.py
@@ -89,12 +89,15 @@ def cat(col):
     for t, col in zip(types, cols):
         # Create empty arrays of length 1, so we can call `repeat` below
         if str(t) == 'category':
-            df[str(col)] = Categorical([-1], categories=cat(col),
+            categories = cat(col)
+            code = 0 if len(categories) else -1
+            df[str(col)] = Categorical([code], categories=categories,
                                                  fastpath=True)
         else:
             if hasattr(t, 'base'):
                 # funky pandas not-dtype
                 t = t.base
+            t = {"M8": "M8[ns]", "m8": "m8[ns]"}.get(t, t)
             d = np.empty(1, dtype=t)
             if d.dtype.kind == "M" and str(col) in timezones:
                 try:

From 55abb8701b568ef7d86ca116e673d166748349ad Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 15 Mar 2021 12:07:28 -0700
Subject: [PATCH 3/3] REF/PERF: faster implementation, equally kludgy, somewhat
 more future-proof

---
 fastparquet/dataframe.py | 68 +++++++++++++++++++++++++---------------
 1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
index 2ddc24af..e8be6d44 100644
--- a/fastparquet/dataframe.py
+++ b/fastparquet/dataframe.py
@@ -2,6 +2,7 @@
 from collections import OrderedDict
 from distutils.version import LooseVersion
 import numpy as np
+from pandas.core.internals import BlockManager
 from pandas import (
     Categorical, DataFrame, Series,
     CategoricalIndex, RangeIndex, Index, MultiIndex,
@@ -87,18 +88,14 @@ def cat(col):
 
     df = OrderedDict()
     for t, col in zip(types, cols):
-        # Create empty arrays of length 1, so we can call `repeat` below
         if str(t) == 'category':
-            categories = cat(col)
-            code = 0 if len(categories) else -1
-            df[str(col)] = Categorical([code], categories=categories,
+            df[str(col)] = Categorical([], categories=cat(col),
                                                  fastpath=True)
         else:
             if hasattr(t, 'base'):
                 # funky pandas not-dtype
                 t = t.base
-            t = {"M8": "M8[ns]", "m8": "m8[ns]"}.get(t, t)
-            d = np.empty(1, dtype=t)
+            d = np.empty(0, dtype=t)
             if d.dtype.kind == "M" and str(col) in timezones:
                 try:
                     d = Series(d).dt.tz_localize(timezones[str(col)])
@@ -164,29 +161,50 @@ def set_cats(values, i=i, col=col, **kwargs):
             views[col] = d
             views[col+'-catdef'] = x
 
-    # Create DataFrame with same dtypes and desired length.
-    df = DataFrame(
-        {col: df[col]._values.repeat(size) for col in df.columns},
-        index=index,
-        columns=df.columns,
-    )
+    axes = [df._data.axes[0], index]
 
-    # create views
-    for col in df.columns:
-        vals = df[col]._values
-        if isinstance(vals, np.ndarray):
-            views[col] = vals
-        elif is_categorical_dtype(vals):
-            views[col] = vals._codes
-            views[col+'-catdef'] = vals
+    # Patch our blocks with desired-length arrays.  Kids: don't try this at home.
+    mgr = df._data
+    for block in mgr.blocks:
+        bvalues = block.values
+        shape = list(bvalues.shape)
+        shape[-1] = size
+
+        if isinstance(bvalues, Categorical):
+            categories = bvalues.categories
+            code = np.zeros(shape=shape, dtype=bvalues.codes.dtype)
 
-        elif hasattr(vals.dtype, "tz"):
-            # datetime64tz, get the ndarray directly backing it
-            views[col] = vals._data
+            values = Categorical(values=code, dtype=bvalues.dtype,
+                                 fastpath=True)
 
+        elif getattr(bvalues.dtype, 'tz', None):
+            values = np.empty(shape=shape, dtype='M8[ns]')
+            values = type(bvalues)(values, dtype=bvalues.dtype)
         else:
-            # catchall, anything that gets here will be an ExtensionArray 
-            views[col] = vals
+            # Note: this will break on any ExtensionDtype other than
+            #  Categorical and DatetimeTZ
+            values = np.empty(shape=shape, dtype=bvalues.dtype)
+
+        block.values = values
+
+    mgr.axes[-1] = index
+
+    # manager was patched in place above; no new BlockManager is constructed here
+    # create views
+    for block in df._data.blocks:
+        dtype = block.dtype
+        inds = block.mgr_locs.indexer
+        if isinstance(inds, slice):
+            inds = list(range(inds.start, inds.stop, inds.step))
+        for i, ind in enumerate(inds):
+            col = df.columns[ind]
+            if is_categorical_dtype(dtype):
+                views[col] = block.values._codes
+                views[col+'-catdef'] = block.values
+            elif getattr(block.dtype, 'tz', None):
+                views[col] = np.asarray(block.values, dtype='M8[ns]')
+            else:
+                views[col] = block.values[i]
 
     if index_names:
         df.index.names = [