Skip to content

Commit

Permalink
complete?
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Sep 14, 2023
1 parent c6af7c9 commit 0ac544d
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 41 deletions.
99 changes: 82 additions & 17 deletions pandas/core/_numba/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def copy(self, dtype=None, ndim: int = 1, layout=None):
return type(self)(dtype, layout, self.pyclass)


class SeriesType(types.ArrayCompatible):
class SeriesType(types.Type):
"""
The type class for Series objects.
"""
Expand Down Expand Up @@ -150,6 +150,10 @@ def __init__(self, dmm, fe_type) -> None:
# typed dict
# It maps from values in the index to their integer positions in the array
("hashmap", types.DictType(fe_type.dtype, types.intp)),
# Pointer to the Index object this was created from, or that it
# boxes to
# https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
("parent", types.pyobject)
]
models.StructModel.__init__(self, dmm, fe_type, members)

Expand Down Expand Up @@ -195,8 +199,20 @@ def pdseries_constructor(context, builder, sig, args):
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())


@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
    """Lower the three-argument Index constructor.

    Builds the native Index struct from ``(data, hashmap, parent)``:
    the backing array, the value->position typed dict, and a pointer to
    the original Python Index object (used to short-circuit boxing).
    """
    # NOTE(review): the old 2-arg @lower_builtin registration was removed
    # here on purpose — this body unpacks three values, and the 2-arg
    # form is lowered by index_constructor_2arg_parent below. Keeping
    # both decorators would register this function for the 2-arg call
    # and fail when unpacking ``args``.
    (data, hashmap, parent) = args
    index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    index.data = data
    index.hashmap = hashmap
    index.parent = parent
    return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())

@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
# Handles the two-argument (no-parent) form of the Index constructor;
# same as the three-argument constructor above, but the parent object
# is not supplied by the caller.
(data, hashmap) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

Expand Down Expand Up @@ -230,16 +246,20 @@ def unbox_index(typ, obj, c):
"""
data_obj = c.pyapi.object_getattr_string(obj, "_data")
index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
# If we see an object array, assume it's been validated as only containing strings
# We still need to do the conversion though
index.data = c.unbox(typ.as_array, data_obj).value
typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
# Create an empty typed dict in numba
# Create an empty typed dict in numba for the hashmap used for indexing
# equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
hashmap_obj = c.pyapi.call_method(
typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
)
index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
# Set the parent for speedy boxing.
index.parent = obj

# Decrefs
c.pyapi.decref(data_obj)
Expand Down Expand Up @@ -283,19 +303,36 @@ def box_index(typ, val, c):
# First build a Numpy array object, then wrap it in a Index
index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)

# TODO: preserve the original class for the index
# Also need preserve the name of the Index

# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))

# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return index_obj
res = cgutils.alloca_once_value(c.builder, index.parent)

# Does parent exist?
# (this means it was boxed once already, or the Index is the same object as the original df.index or df.columns)
# xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (has_parent, otherwise):
with has_parent:
c.pyapi.incref(index.parent)
with otherwise:
# TODO: preserve the original class for the index
# Also need preserve the name of the Index
# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
if isinstance(typ.dtype, types.UnicodeCharSeq):
# We converted to numpy string dtype, convert back
# to object since _simple_new won't do that for uss
object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
c.pyapi.decref(object_str_obj)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
index.parent = index_obj
c.pyapi.print_object(index.parent)
c.builder.store(index_obj, res)

# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return c.builder.load(res)


@box(SeriesType)
Expand Down Expand Up @@ -333,7 +370,8 @@ def box_series(typ, val, c):
return series_obj


# Add common series reductions
# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)


def generate_series_reduction(reduction, reduction_method):
Expand All @@ -347,6 +385,23 @@ def series_reduction_impl(series):
return series_reduction


def generate_series_binop(binop):
    """Register a numba ``@overload`` of ``binop`` for Series operands.

    The generated overload supports both ``Series <op> Series`` and
    ``Series <op> scalar``; the result Series reuses the left operand's
    index and name.

    Parameters
    ----------
    binop : callable
        The binary operator (e.g. ``operator.add``) to overload.

    Returns
    -------
    callable
        The registered overload stub (returned for introspection; numba
        registration happens as a side effect of the decorator).
    """

    @overload(binop)
    def series_binop(series1, value):
        if isinstance(series1, SeriesType):
            if isinstance(value, SeriesType):
                # numba requires the implementation's signature to match
                # the overload stub's signature, so the second parameter
                # must be named ``value`` here even though it is a Series.
                def series_binop_impl(series1, value):
                    # TODO: Check index matching?
                    return Series(
                        binop(series1.values, value.values),
                        series1.index,
                        series1.name,
                    )

                return series_binop_impl
            else:

                def series_binop_impl(series1, value):
                    return Series(
                        binop(series1.values, value),
                        series1.index,
                        series1.name,
                    )

                return series_binop_impl

    return series_binop


series_reductions = [
("sum", np.sum),
("mean", np.mean),
Expand All @@ -356,6 +411,16 @@ def series_reduction_impl(series):
for reduction, reduction_method in series_reductions:
generate_series_reduction(reduction, reduction_method)

# Arithmetic operators for which Series overloads are registered.
series_binops = [
    operator.add,
    operator.sub,
    operator.mul,
    operator.truediv,
]

for _series_binop in series_binops:
    generate_series_binop(_series_binop)


# get_loc on Index
@overload_method(IndexType, "get_loc")
Expand Down
50 changes: 26 additions & 24 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ def __init__(
if by_row is not False and by_row != "compat":
raise ValueError(f"by_row={by_row} not allowed")
self.engine = engine
self.engine_kwargs = engine_kwargs
self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs
super().__init__(
obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs
)
Expand Down Expand Up @@ -826,11 +826,6 @@ def values(self):
def apply(self) -> DataFrame | Series:
"""compute the results"""

if self.engine == "numba" and not self.raw:
raise ValueError(
"The numba engine in DataFrame.apply can only be used when raw=True"
)

# dispatch to handle list-like or dict-like
if is_list_like(self.func):
return self.apply_list_or_dict_like()
Expand Down Expand Up @@ -1009,9 +1004,6 @@ def apply_standard(self):
else:
results, res_index = self.apply_series_numba()

# print(results)
# print(res_index)

# wrap results
return self.wrap_results(results, res_index)

Expand Down Expand Up @@ -1083,6 +1075,10 @@ def generate_numba_apply_func(
func, nogil=True, nopython=True, parallel=False
) -> Callable[[npt.NDarray], dict[int, Any]]:
from pandas import Series
# Dummy import just to make the extensions loaded in
# This isn't an entrypoint since we don't want users
# using Series/DF in numba code outside of apply
from pandas.core._numba.extensions import SeriesType

numba = import_optional_dependency("numba")

Expand All @@ -1096,9 +1092,7 @@ def numba_func(values, col_names, df_index):
# TODO: No need for the str call?
# Need to adapt types to accept UnicodeCharSeq in Series constructor
ser = Series(values[:, j], index=df_index, name=str(col_names[j]))

results[j] = jitted_udf(ser)

return results

return numba_func
Expand All @@ -1114,7 +1108,7 @@ def apply_with_numba(self) -> dict[int, Any]:
col_names_values = col_names_values.astype("U")
df_index = self.obj.index

return nb_func(self.values, col_names_values, df_index)
return dict(nb_func(self.values, col_names_values, df_index))

@property
def result_index(self) -> Index:
Expand Down Expand Up @@ -1204,7 +1198,10 @@ def series_generator(self) -> Generator[Series, None, None]:
def generate_numba_apply_func(
func, nogil=True, nopython=True, parallel=False
) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]:
# Unused import just to register the extensions
# Dummy import just to make the extensions loaded in
# This isn't an entrypoint since we don't want users
# using Series/DF in numba code outside of apply
from pandas.core._numba.extensions import SeriesType

from pandas import (
Index,
Expand All @@ -1216,16 +1213,15 @@ def generate_numba_apply_func(
jitted_udf = numba.extending.register_jitable(func)

@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
def numba_func(values, col_names, index_values):
def numba_func(values, col_names_index, index_values):
results = {}
col_names_index = Index(col_names)
#col_names_index = Index(col_names)
for i in range(values.shape[0]):
# Create the series
# TODO: values corrupted without the copy
ser = Series(
values[i].copy(), index=col_names_index, name=index_values[i]
)

results[i] = jitted_udf(ser)

return results
Expand All @@ -1235,22 +1231,28 @@ def numba_func(values, col_names, index_values):
def apply_with_numba(self) -> dict[int, Any]:
nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs)

# Unpack the index and repack it inside the jitted numba function
# This is since if we have object dtype and strings we want to convert
# to a numpy string dtype (and our regular index doesn't support numpy string dtypes)
col_names_values = self.columns._data
if col_names_values.dtype == object:
if not lib.is_string_array(col_names_values):
# Since numpy/numba doesn't support object arrays of strings well
# we'll do a sketchy thing where if index._data is object
# we convert to string and directly set index._data to that,
# setting it back after we call the function
fixed_obj_dtype = False
orig_data = self.columns._data
if self.columns._data.dtype == object:
if not lib.is_string_array(self.columns._data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
)
col_names_values = col_names_values.astype("U")
# Remember to set this back!!!
self.columns._data = self.columns._data.astype("U")
fixed_obj_dtype = True
index_values = self.obj.index.values

# Convert from numba dict to regular dict
# Our isinstance checks in the df constructor don't pass for numbas typed dict
result_nb_dict = nb_func(self.values, col_names_values, index_values)
result_nb_dict = nb_func(self.values, self.columns, index_values)
result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values()
if fixed_obj_dtype:
self.columns._data = orig_data
return dict(zip(result_keys, result_values))
# return dict(nb_func(self.values, col_names_values, index_values))

Expand Down

0 comments on commit 0ac544d

Please sign in to comment.