Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C Functionality for Accessing DataFrame Values in Column Order #32343

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 222 additions & 0 deletions pandas/_libs/ndframe_iter.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
#include "assert.h"
#include "ndframe_iter.h"

// returns -1 on error
// performs no bounds checking, so will likely segfault if you pass
// and inappropriate dim for the object
static Py_ssize_t getDimLength(PyObject *df, Py_ssize_t dim) {
PyObject *shape;
Py_ssize_t ncols;

shape = PyObject_GetAttrString(df, "shape");
if (shape == NULL) {
return -1;
}

if (!PyTuple_Check(shape)) {
Py_DECREF(shape);
return -1;
}

ncols = PyLong_AsLongLong(PyTuple_GET_ITEM(shape, dim));
Py_DECREF(shape);

return ncols;
}

// Return the blocks object associated with a dataframe
// Checks that the return value is a Tuple
static PyObject *getBlocksTuple(PyObject *df) {
PyObject *blockManager, *blocks;

blockManager = PyObject_GetAttrString(df, "_data");
if (blockManager == NULL) {
return NULL;
}

blocks = PyObject_GetAttrString(blockManager, "blocks");
Py_DECREF(blockManager);
if (blocks == NULL) {
return NULL;
}

if (!PyTuple_Check(blocks)) {
// TODO: Set error message here
Py_DECREF(blocks);
return NULL;
}

return blocks;
}

// Return the integer placements of the blocks columns in its owning frame
// Checks that the return value is a List
static PyObject *getManagerLocationsAsList(PyObject *block) {
PyObject *managerLocs, *ndarray, *list;

managerLocs = PyObject_GetAttrString(block, "mgr_locs");
if (managerLocs == NULL) {
return NULL;
}

// TODO: we could probably just supply managerLocs to the list()
// built-in instead of going from mgr_locs->as_array->tolist
ndarray = PyObject_GetAttrString(managerLocs, "as_array");
Py_DECREF(managerLocs);
if (ndarray == NULL) {
return NULL;
}

list = PyObject_CallMethod(ndarray, "tolist", NULL);
Py_DECREF(ndarray);
if (list == NULL) {
return NULL;
} else if (!PyList_Check(list)) {
Py_DECREF(list);
return NULL;
}

return list;
}


// Given a DataFrame, this will create a struct containing both
// the length of ndarrays the frame uses along with an array of
// PyArrayObjects, appearing in C-order along the requested axis.
//
// For example, if we had a DataFrame that looks as follows
// a b c
// 0 1 2.0 3
// 1 4 5.0 6
//
// This would deconstruct the blocks and provide a struct
// with len 3; accessing
// ndarrays[0] will yield ndarray([1, 4])
// ndarrays[1] will yield ndarray([2., 5.])
// ndarrays[2] will yield ndarray([3, 6])
//
// The goal of this is to provide a performant way in C to
// maintain axis order and dtype information of the
// supplied DataFrame.
PdOrderedArrays *PdOrderedArrays_New(PyObject *df, int axis) {
PyObject *blocks, *block, *managerLocs, *blockValues, *ndarr, *key;
PyObject **ndarrays;
Py_ssize_t i, j, loc, ncols;
PdOrderedArrays *result;

//
assert(axis == 0);

ncols = getDimLength(df, 1);
blocks = getBlocksTuple(df);
if (blocks == NULL) {
return NULL;
}

ndarrays = PyObject_Malloc(sizeof(PyObject *) * ncols);
if (ndarrays == NULL) {
Py_DECREF(blocks);
return NULL;
}


// Iterate over all of the blocks, getting a slice from
// each block sequentially and storing it in the appropriate order
// in `arrays`
for (i = 0; i < PyTuple_GET_SIZE(blocks); i++) {
block = PyTuple_GET_ITEM(blocks, i);

blockValues = PyObject_CallMethod(block, "get_block_values", NULL);
if (blockValues == NULL) {
Py_DECREF(blocks);
PyObject_Free(ndarrays);
return NULL;
}

managerLocs = getManagerLocationsAsList(block);
if (managerLocs == NULL) {
Py_DECREF(blockValues);
Py_DECREF(blocks);
PyObject_Free(ndarrays);
return NULL;
}

// managerLocs tells use where each column of the block
// exists in the maintaining dataframe, so iterate
// managerLocs sequentially and assign, get the column of
// data and assign that reference to the appropriate position
// in `ndarrays`
for (j = 0; j < PyList_GET_SIZE(managerLocs); j++) {
loc = PyLong_AsLongLong(PyList_GET_ITEM(managerLocs, j));
key = PyLong_FromLongLong(j);
if (key == NULL) {
goto LOOP_ERROR;
}

// TODO: We should actually be grabbing an ndarray reference here
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is rather verbose, I should point out that this comment might be the most helpful in understanding what is going on here (or at least what should be)

// and storing that in ndarrays; I haven't quite figured out
// with the Numpy C-API how to do that though
//
// For illustration, let's assume we have a DataFrame that looks like:
// colA colB colC
// 0 0 X 1
// 1 2 Y 3
//
// If we are currently iterating over the int block, it would look
// something like this:
//
// [[0, 1],
// [2, 3]]
//
// and managerLocs would be [0, 2] (note position of int dtypes in dataframe)
//
// So in this loop we would ideally create a ndarray that references [0, 2]
// (i.e. the first column of the above block) and store a pointer to that
// at arrays[0]. We would then get an array that references [1, 3] and store
// that at arrays[2].
//
// Iteration over the string block should populate an ndarray at arrays[1]
// that holds a reference to ["X", "Y"]
ndarr = PyObject_GetItem(blockValues, key); // TODO: no GetItem; construct ndarray somehow
Py_DECREF(key);
if (ndarr == NULL) {
goto LOOP_ERROR;
}

ndarrays[loc] = ndarr;
continue;
LOOP_ERROR:
Py_DECREF(managerLocs);
Py_DECREF(blockValues);
Py_DECREF(blocks);
PyObject_Free(ndarrays);
return NULL;
}

Py_DECREF(managerLocs);
Py_DECREF(blockValues);
}

Py_DECREF(blocks);

result = PyObject_Malloc(sizeof(PdOrderedArrays));
if (result == NULL) {
return NULL;
}

result->len = ncols;
result->ndarrays = ndarrays;

return result;
}

void PdOrderedArrays_Destroy(PdOrderedArrays *orderedArrays) {
Py_ssize_t i;

for (i = 0; i < orderedArrays->len; i++) {
Py_DECREF(orderedArrays->ndarrays++);
}

PyObject_Free(orderedArrays->ndarrays);
PyObject_Free(orderedArrays);
}
26 changes: 26 additions & 0 deletions pandas/_libs/ndframe_iter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// TODO: could use better naming than "PdBlockIter"
// showing that this is really "deconstructed" block data
// in native form

#ifndef PANDAS__NDFRAME_ITER
#define PANDAS__NDFRAME_ITER

#define PY_SSIZE_T_CLEAN
#include <Python.h>


// Struct containing a pointer to len # of NDArrays
// The order of each item in data should match the
// order specified by the BlockManager
typedef struct {
Py_ssize_t len;
PyObject **ndarrays; // TODO: need some kind of destructor
} PdOrderedArrays;

// Provided a DataFrame and axis deconstructs the block
// data to match the order represented by the BlockManager
// Returns NULL on error
PdOrderedArrays *PdOrderedArrays_New(PyObject *df, int axis);
void PdOrderedArrays_Destroy(PdOrderedArrays *orderedArrays);

#endif
13 changes: 13 additions & 0 deletions pandas/tests/frame/test_native.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import ctypes

import pandas as pd


def test_func():
# TODO: this is hard coded to only work on Mac
libc = ctypes.PyDLL('pandas/_libs/ndframe_data.cpython-37m-darwin.so')
df = pd.DataFrame([[1, "2", 3., 4., 5], [6, "7", 8., 9, 10]])
obj = libc.PdOrderedArrays_New(ctypes.py_object(df), ctypes.c_int(0))
# TODO: Improve test by accessing numpy array elements sequentially
# and testing for appropriate values
#libc.PdOrderedArrays_Destroy(obj)
13 changes: 12 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ def initialize_options(self):
pjoin(ujson_lib, "ultrajsonenc.c"),
pjoin(ujson_lib, "ultrajsondec.c"),
pjoin(util, "move.c"),
pjoin("pandas", "_libs", "ndframe_iter.c"),
]

for root, dirs, files in os.walk("pandas"):
Expand Down Expand Up @@ -730,8 +731,18 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):


extensions.append(ujson_ext)
# ----------------------------------------------------------------------w
# native frame access functions
native_frame_ext = Extension(
"pandas._libs.ndframe_data",
sources=["pandas/_libs/ndframe_iter.c"],
include_dirs=["pandas/_libs"],
define_macros=macros,
depends=["ndframe_iter.h"],
)

# ----------------------------------------------------------------------
extensions.append(native_frame_ext)
# ----------------------------------------------------------------------w


def setup_package():
Expand Down