Skip to content

Commit

Permalink
complete?
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Sep 14, 2023
1 parent c6af7c9 commit 0ac544d
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 41 deletions.
99 changes: 82 additions & 17 deletions pandas/core/_numba/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def copy(self, dtype=None, ndim: int = 1, layout=None):
return type(self)(dtype, layout, self.pyclass)


class SeriesType(types.ArrayCompatible):
class SeriesType(types.Type):
"""
The type class for Series objects.
"""
Expand Down Expand Up @@ -150,6 +150,10 @@ def __init__(self, dmm, fe_type) -> None:
# typed dict
# It maps from values in the index to their integer positions in the array
("hashmap", types.DictType(fe_type.dtype, types.intp)),
# Pointer to the Index object this was created from, or that it
# boxes to
# https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
("parent", types.pyobject)
]
models.StructModel.__init__(self, dmm, fe_type, members)

Expand Down Expand Up @@ -195,8 +199,20 @@ def pdseries_constructor(context, builder, sig, args):
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())


@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
    """Lower the three-argument Index constructor.

    Builds the native Index struct from ``(data, hashmap, parent)``:
    the backing array, the value->position typed dict, and a pointer to
    the original Python Index object (used to short-circuit boxing).
    """
    # NOTE(review): the old 2-arg @lower_builtin registration was removed
    # here on purpose — this body unpacks three values, and the 2-arg
    # form is lowered by index_constructor_2arg_parent below. Keeping
    # both decorators would register this function for the 2-arg call
    # and fail when unpacking ``args``.
    (data, hashmap, parent) = args
    index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    index.data = data
    index.hashmap = hashmap
    index.parent = parent
    return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())

@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
# Handles the two-argument (no-parent) form of the Index constructor;
# same as the three-argument constructor above, but the parent object
# is not supplied by the caller.
(data, hashmap) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

Expand Down Expand Up @@ -230,16 +246,20 @@ def unbox_index(typ, obj, c):
"""
data_obj = c.pyapi.object_getattr_string(obj, "_data")
index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
# If we see an object array, assume it's been validated as only containing strings
# We still need to do the conversion though
index.data = c.unbox(typ.as_array, data_obj).value
typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
# Create an empty typed dict in numba
# Create an empty typed dict in numba for the hashmap used for indexing
# equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
hashmap_obj = c.pyapi.call_method(
typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
)
index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
# Set the parent for speedy boxing.
index.parent = obj

# Decrefs
c.pyapi.decref(data_obj)
Expand Down Expand Up @@ -283,19 +303,36 @@ def box_index(typ, val, c):
# First build a Numpy array object, then wrap it in a Index
index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)

# TODO: preserve the original class for the index
# Also need preserve the name of the Index

# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))

# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return index_obj
res = cgutils.alloca_once_value(c.builder, index.parent)

# Does parent exist?
# (this means it was boxed once already, or the Index is the same object as the original df.index or df.columns)
# xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (has_parent, otherwise):
with has_parent:
c.pyapi.incref(index.parent)
with otherwise:
# TODO: preserve the original class for the index
# Also need preserve the name of the Index
# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
if isinstance(typ.dtype, types.UnicodeCharSeq):
# We converted to numpy string dtype, convert back
# to object since _simple_new won't do that for uss
object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
c.pyapi.decref(object_str_obj)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
index.parent = index_obj
c.pyapi.print_object(index.parent)
c.builder.store(index_obj, res)

# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return c.builder.load(res)


@box(SeriesType)
Expand Down Expand Up @@ -333,7 +370,8 @@ def box_series(typ, val, c):
return series_obj


# Add common series reductions
# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)


def generate_series_reduction(reduction, reduction_method):
Expand All @@ -347,6 +385,23 @@ def series_reduction_impl(series):
return series_reduction


def generate_series_binop(binop):
    """Register a numba ``@overload`` of ``binop`` for Series operands.

    The generated overload supports both ``Series <op> Series`` and
    ``Series <op> scalar``; the result Series reuses the left operand's
    index and name.

    Parameters
    ----------
    binop : callable
        The binary operator (e.g. ``operator.add``) to overload.

    Returns
    -------
    callable
        The registered overload stub (returned for introspection; numba
        registration happens as a side effect of the decorator).
    """

    @overload(binop)
    def series_binop(series1, value):
        if isinstance(series1, SeriesType):
            if isinstance(value, SeriesType):
                # numba requires the implementation's signature to match
                # the overload stub's signature, so the second parameter
                # must be named ``value`` here even though it is a Series.
                def series_binop_impl(series1, value):
                    # TODO: Check index matching?
                    return Series(
                        binop(series1.values, value.values),
                        series1.index,
                        series1.name,
                    )

                return series_binop_impl
            else:

                def series_binop_impl(series1, value):
                    return Series(
                        binop(series1.values, value),
                        series1.index,
                        series1.name,
                    )

                return series_binop_impl

    return series_binop


series_reductions = [
("sum", np.sum),
("mean", np.mean),
Expand All @@ -356,6 +411,16 @@ def series_reduction_impl(series):
for reduction, reduction_method in series_reductions:
generate_series_reduction(reduction, reduction_method)

# Arithmetic operators for which Series overloads are registered.
series_binops = [
    operator.add,
    operator.sub,
    operator.mul,
    operator.truediv,
]

for _series_binop in series_binops:
    generate_series_binop(_series_binop)


# get_loc on Index
@overload_method(IndexType, "get_loc")
Expand Down
50 changes: 26 additions & 24 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ def __init__(
if by_row is not False and by_row != "compat":
raise ValueError(f"by_row={by_row} not allowed")
self.engine = engine
self.engine_kwargs = engine_kwargs
self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs
super().__init__(
obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs
)
Expand Down Expand Up @@ -826,11 +826,6 @@ def values(self):
def apply(self) -> DataFrame | Series:
"""compute the results"""

if self.engine == "numba" and not self.raw:
raise ValueError(
"The numba engine in DataFrame.apply can only be used when raw=True"
)

# dispatch to handle list-like or dict-like
if is_list_like(self.func):
return self.apply_list_or_dict_like()
Expand Down Expand Up @@ -1009,9 +1004,6 @@ def apply_standard(self):
else:
results, res_index = self.apply_series_numba()

# print(results)
# print(res_index)

# wrap results
return self.wrap_results(results, res_index)

Expand Down Expand Up @@ -1083,6 +1075,10 @@ def generate_numba_apply_func(
func, nogil=True, nopython=True, parallel=False
) -> Callable[[npt.NDarray], dict[int, Any]]:
from pandas import Series
# Dummy import just to make the extensions loaded in
# This isn't an entrypoint since we don't want users
# using Series/DF in numba code outside of apply
from pandas.core._numba.extensions import SeriesType

numba = import_optional_dependency("numba")

Expand All @@ -1096,9 +1092,7 @@ def numba_func(values, col_names, df_index):
# TODO: No need for the str call?
# Need to adapt types to accept UnicodeCharSeq in Series constructor
ser = Series(values[:, j], index=df_index, name=str(col_names[j]))

results[j] = jitted_udf(ser)

return results

return numba_func
Expand All @@ -1114,7 +1108,7 @@ def apply_with_numba(self) -> dict[int, Any]:
col_names_values = col_names_values.astype("U")
df_index = self.obj.index

return nb_func(self.values, col_names_values, df_index)
return dict(nb_func(self.values, col_names_values, df_index))

@property
def result_index(self) -> Index:
Expand Down Expand Up @@ -1204,7 +1198,10 @@ def series_generator(self) -> Generator[Series, None, None]:
def generate_numba_apply_func(
func, nogil=True, nopython=True, parallel=False
) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]:
# Unused import just to register the extensions
# Dummy import just to make the extensions loaded in
# This isn't an entrypoint since we don't want users
# using Series/DF in numba code outside of apply
from pandas.core._numba.extensions import SeriesType

from pandas import (
Index,
Expand All @@ -1216,16 +1213,15 @@ def generate_numba_apply_func(
jitted_udf = numba.extending.register_jitable(func)

@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
def numba_func(values, col_names, index_values):
def numba_func(values, col_names_index, index_values):
results = {}
col_names_index = Index(col_names)
#col_names_index = Index(col_names)
for i in range(values.shape[0]):
# Create the series
# TODO: values corrupted without the copy
ser = Series(
values[i].copy(), index=col_names_index, name=index_values[i]
)

results[i] = jitted_udf(ser)

return results
Expand All @@ -1235,22 +1231,28 @@ def numba_func(values, col_names, index_values):
def apply_with_numba(self) -> dict[int, Any]:
nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs)

# Unpack the index and repack it inside the jitted numba function
# This is since if we have object dtype and strings we want to convert
# to a numpy string dtype (and our regular index doesn't support numpy string dtypes)
col_names_values = self.columns._data
if col_names_values.dtype == object:
if not lib.is_string_array(col_names_values):
# Since numpy/numba doesn't support object arrays of strings well
# we'll do a sketchy thing where if index._data is object
# we convert to string and directly set index._data to that,
# setting it back after we call the function
fixed_obj_dtype = False
orig_data = self.columns._data
if self.columns._data.dtype == object:
if not lib.is_string_array(self.columns._data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
)
col_names_values = col_names_values.astype("U")
# Remember to set this back!!!
self.columns._data = self.columns._data.astype("U")
fixed_obj_dtype = True
index_values = self.obj.index.values

# Convert from numba dict to regular dict
# Our isinstance checks in the df constructor don't pass for numbas typed dict
result_nb_dict = nb_func(self.values, col_names_values, index_values)
result_nb_dict = nb_func(self.values, self.columns, index_values)
result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values()
if fixed_obj_dtype:
self.columns._data = orig_data
return dict(zip(result_keys, result_values))
# return dict(nb_func(self.values, col_names_values, index_values))

Expand Down

0 comments on commit 0ac544d

Please sign in to comment.