From 9cfdefde6bbfb0469feb43f6f4ca2d3b387575e7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 1 Jul 2019 11:00:10 -0400 Subject: [PATCH 01/34] Removed block context from objToJSON --- pandas/_libs/src/ujson/python/objToJSON.c | 386 +--------------------- 1 file changed, 13 insertions(+), 373 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index cc87d95bf35d8..8b3bf05b2ea87 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -78,15 +78,6 @@ typedef struct __NpyArrContext { char **columnLabels; } NpyArrContext; -typedef struct __PdBlockContext { - int colIdx; - int ncols; - int transpose; - - int *cindices; // frame column -> block column map - NpyArrContext **npyCtxts; // NpyArrContext for each column -} PdBlockContext; - typedef struct __TypeContext { JSPFN_ITERBEGIN iterBegin; JSPFN_ITEREND iterEnd; @@ -108,7 +99,6 @@ typedef struct __TypeContext { char *cStr; NpyArrContext *npyarr; - PdBlockContext *pdblock; int transpose; char **rowLabels; char **columnLabels; @@ -122,9 +112,6 @@ typedef struct __PyObjectEncoder { // pass through the NpyArrContext when encoding multi-dimensional arrays NpyArrContext *npyCtxtPassthru; - // pass through the PdBlockContext when encoding blocks - PdBlockContext *blkCtxtPassthru; - // pass-through to encode numpy data directly int npyType; void *npyValue; @@ -146,8 +133,6 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; #define PRINTMARK() -int PdBlock_iterNext(JSOBJ, JSONTypeContext *); - void *initObjToJSON(void) { PyObject *mod_pandas; @@ -200,7 +185,6 @@ static TypeContext *createTypeContext(void) { pc->doubleValue = 0.0; pc->cStr = NULL; pc->npyarr = NULL; - pc->pdblock = NULL; pc->rowLabels = NULL; pc->columnLabels = NULL; pc->transpose = 0; @@ -315,18 +299,6 @@ static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { return ret; } -static int is_simple_frame(PyObject *obj) { 
- PyObject *check = get_sub_attr(obj, "_data", "is_mixed_type"); - int ret = (check == Py_False); - - if (!check) { - return 0; - } - - Py_DECREF(check); - return ret; -} - static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { PyObject *tmp = PyObject_GetAttrString(obj, attr); Py_ssize_t ret; @@ -803,311 +775,6 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return NULL; } -//============================================================================= -// Pandas block iteration functions -// -// Serialises a DataFrame column by column to avoid unnecessary data copies and -// more representative serialisation when dealing with mixed dtypes. -// -// Uses a dedicated NpyArrContext for each column. -//============================================================================= - -void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->transpose) { - blkCtxt->colIdx++; - } else { - blkCtxt->colIdx = 0; - } - - NpyArr_freeItemValue(obj, tc); -} - -int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - blkCtxt->colIdx++; - return NpyArr_iterNextItem(obj, tc); -} - -char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - npy_intp idx; - PRINTMARK(); - - if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { - idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } else { - idx = GET_TC(tc)->iterNext != PdBlock_iterNext - ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 - : npyarr->index[npyarr->stridedim]; - - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; -} - -char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - npy_intp idx; - PRINTMARK(); - - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } else { - idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; -} - -int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr; - PRINTMARK(); - - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } - - if (blkCtxt->transpose) { - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - } else { - npyarr = blkCtxt->npyCtxts[0]; - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } - } - - ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; - GET_TC(tc)->itemValue = obj; - - return 1; -} - -void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->transpose) { - // if transposed we exhaust each column before moving to the next - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - } -} - -void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *blocks, *block, *values, *tmp; - PyArrayObject *locs; - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - Py_ssize_t i; - PyArray_Descr *dtype; - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - npy_int64 **dataptr; - npy_int64 colIdx; - npy_intp idx; - - 
PRINTMARK(); - - i = 0; - blocks = NULL; - dtype = PyArray_DescrFromType(NPY_INT64); - obj = (PyObject *)_obj; - - GET_TC(tc) - ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; - - blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); - if (!blkCtxt) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - GET_TC(tc)->pdblock = blkCtxt; - - blkCtxt->colIdx = 0; - blkCtxt->transpose = GET_TC(tc)->transpose; - blkCtxt->ncols = get_attr_length(obj, "columns"); - - if (blkCtxt->ncols == 0) { - blkCtxt->npyCtxts = NULL; - blkCtxt->cindices = NULL; - - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - blkCtxt->npyCtxts = - PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); - if (!blkCtxt->npyCtxts) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - for (i = 0; i < blkCtxt->ncols; i++) { - blkCtxt->npyCtxts[i] = NULL; - } - - blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); - if (!blkCtxt->cindices) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - blocks = get_sub_attr(obj, "_data", "blocks"); - if (!blocks) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - // force transpose so each NpyArrContext strides down its column - GET_TC(tc)->transpose = 1; - - for (i = 0; i < PyObject_Length(blocks); i++) { - block = get_item(blocks, i); - if (!block) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - tmp = get_values(block); - if (!tmp) { - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - Py_DECREF(block); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - values = PyArray_Transpose((PyArrayObject *)tmp, NULL); - Py_DECREF(tmp); - if (!values) { - Py_DECREF(block); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); - if (!locs) { - 
Py_DECREF(block); - Py_DECREF(values); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, - NPY_NO_CASTING, dtype); - if (!iter) { - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - NpyIter_Deallocate(iter); - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter); - do { - colIdx = **dataptr; - idx = NpyIter_GetIterIndex(iter); - - blkCtxt->cindices[colIdx] = idx; - - // Reference freed in Pdblock_iterend - Py_INCREF(values); - GET_TC(tc)->newObj = values; - - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; - - // set the dataptr to our desired column and initialise - if (npyarr != NULL) { - npyarr->dataptr += npyarr->stride * idx; - NpyArr_iterNext(obj, tc); - } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - - blkCtxt->npyCtxts[colIdx] = npyarr; - GET_TC(tc)->newObj = NULL; - } while (iternext(iter)); - - NpyIter_Deallocate(iter); - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; - -BLKRET: - Py_XDECREF(dtype); - Py_XDECREF(blocks); -} - -void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - int i; - PRINTMARK(); - - GET_TC(tc)->itemValue = NULL; - npyarr = GET_TC(tc)->npyarr; - - blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt) { - for (i = 0; i < blkCtxt->ncols; i++) { - npyarr = blkCtxt->npyCtxts[i]; - if (npyarr) { - if (npyarr->array) { - Py_DECREF(npyarr->array); - npyarr->array = NULL; - } - - GET_TC(tc)->npyarr = npyarr; - NpyArr_iterEnd(obj, tc); - - blkCtxt->npyCtxts[i] = NULL; - } - } - - if 
(blkCtxt->npyCtxts) { - PyObject_Free(blkCtxt->npyCtxts); - } - if (blkCtxt->cindices) { - PyObject_Free(blkCtxt->cindices); - } - PyObject_Free(blkCtxt); - } -} - //============================================================================= // Tuple iteration functions // itemValue is borrowed reference, no ref counting @@ -1467,15 +1134,10 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; - } + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } } else { PRINTMARK(); return 0; @@ -2002,21 +1664,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = NpyArr_iterGetName; return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { - if (enc->blkCtxtPassthru) { - PRINTMARK(); - pc->pdblock = enc->blkCtxtPassthru; - tc->type = - (pc->pdblock->npyCtxts[0]->columnLabels ? 
JT_OBJECT : JT_ARRAY); - - pc->iterBegin = PdBlockPassThru_iterBegin; - pc->iterEnd = PdBlockPassThru_iterEnd; - pc->iterNext = PdBlock_iterNextItem; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; - - enc->blkCtxtPassthru = NULL; - return; - } if (enc->outputFormat == SPLIT) { PRINTMARK(); @@ -2030,22 +1677,16 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } PRINTMARK(); - if (is_simple_frame(obj)) { - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetName = NpyArr_iterGetName; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetName = NpyArr_iterGetName; + + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; + } - pc->newObj = get_values(obj); - if (!pc->newObj) { - goto INVALID; - } - } else { - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; - } pc->iterGetValue = NpyArr_iterGetValue; if (enc->outputFormat == VALUES) { @@ -2326,7 +1967,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; pyEncoder.npyType = -1; pyEncoder.npyValue = NULL; pyEncoder.datetimeIso = 0; From 6f90e45692a610d06f0fd2601a1cd9f87ba4e9e1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 3 Jul 2019 09:05:12 -0400 Subject: [PATCH 02/34] Moved some block stuff around --- pandas/_libs/src/ujson/python/objToJSON.c | 73 +++++++++++++++-------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 8b3bf05b2ea87..7d03f6e56fee7 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1109,39 +1109,66 @@ char 
*Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + Py_ssize_t n_cols; + + // For SPLIT format the index tracks columns->index->data progression + // all other formats use this to index by column GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } + + if (enc->outputFormat == SPLIT) { + PRINTMARK(); + tc->type = JT_OBJECT; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } + } else { + n_cols = get_attr_length(obj, "columns"); + if (n_cols == 0) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + } + } + PRINTMARK(); } int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t index; - if (!GET_TC(tc)->cStr) { + Py_ssize_t n_cols; + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + + if (enc->outputFormat == SPLIT) { + if (!GET_TC(tc)->cStr) { return 0; - } + } - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); - } else if (index == 1) { + } else if (index == 1) { memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { + } else if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; } - } else { + } else { PRINTMARK(); return 0; - } + } + } else { + n_cols = 
get_attr_length(obj, "columns"); + if (n_cols == 0) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + if (index >= n_cols) { + return 0; GET_TC(tc)->index++; PRINTMARK(); @@ -1665,16 +1692,14 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { - if (enc->outputFormat == SPLIT) { - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; - return; - } + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + PRINTMARK(); pc->iterBegin = NpyArr_iterBegin; From 0a01ada3bbee8990f5590c7a1f1972c6e937706f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 3 Jul 2019 10:23:14 -0400 Subject: [PATCH 03/34] Added frame context and stubbed iteration --- pandas/_libs/src/ujson/python/objToJSON.c | 53 ++++++++++++++++++----- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 7d03f6e56fee7..54e356318abe4 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -78,6 +78,11 @@ typedef struct __NpyArrContext { char **columnLabels; } NpyArrContext; +typedef struct __PdFrameContext { + PyObject *items; // reference to appropriate iterator + Py_ssize_t index; // current position of iteration in frame +} PdFrameContext; + typedef struct __TypeContext { JSPFN_ITERBEGIN iterBegin; JSPFN_ITEREND iterEnd; @@ -1124,17 +1129,23 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyErr_NoMemory(); } } else { - n_cols = get_attr_length(obj, "columns"); - if (n_cols == 0) { - GET_TC(tc)->iterNext = 
NpyArr_iterNextNone; + // Begin iteration over a dataframe's columns + PyObject *tmp = PyObject_CallMethod(obj, "items"); + + if (tmp == 0) { + return; } + + PyObject *ctx = PyObject_Malloc(size(PdFrameContext)); + ctx->index = 0; + ctx->items = tmp; + GET_TC(tc)->prv = ctx; } PRINTMARK(); } int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; Py_ssize_t n_cols; PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; @@ -1157,37 +1168,55 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->itemValue) { return 0; } + + GET_TC(tc)->index++; } else { PRINTMARK(); return 0; } } else { n_cols = get_attr_length(obj, "columns"); - if (n_cols == 0) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - if (index >= n_cols) { - return 0; + PdFrameContext *ctx = &(GET_TC(tc)->prv); + + if (ctx->index >= n_cols) + return 0; // TODO: does n_cols own a reference here? + + ctx->index++; + } - GET_TC(tc)->index++; PRINTMARK(); return 1; } void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + + if (enc->outputFormat == SPLIT) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + } else { + PyObject_Free(GET_TC(tc)->items); + } enc->outputFormat = enc->originalOutputFormat; - PRINTMARK(); + } JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + if (enc->outputFormat == SPLIT) { return GET_TC(tc)->itemValue; + } else { + // get appropriate object from iterable + } } char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + if (enc->outputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; + } else { + // Return label of array here... 
+ } } //============================================================================= From cb820d12f883ecad6715af4eec884056febae4f8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 10 Jul 2019 12:45:15 -0700 Subject: [PATCH 04/34] checpoint --- pandas/_libs/src/ujson/python/objToJSON.c | 48 ++++++++++++++++------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 54e356318abe4..e0659474a9bdd 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -79,8 +79,8 @@ typedef struct __NpyArrContext { } NpyArrContext; typedef struct __PdFrameContext { - PyObject *items; // reference to appropriate iterator - Py_ssize_t index; // current position of iteration in frame + PyObject *iterable; + PyObject *currItem; } PdFrameContext; typedef struct __TypeContext { @@ -104,6 +104,7 @@ typedef struct __TypeContext { char *cStr; NpyArrContext *npyarr; + PdFrameContext *frame; int transpose; char **rowLabels; char **columnLabels; @@ -190,6 +191,7 @@ static TypeContext *createTypeContext(void) { pc->doubleValue = 0.0; pc->cStr = NULL; pc->npyarr = NULL; + pc->frame = NULL; pc->rowLabels = NULL; pc->columnLabels = NULL; pc->transpose = 0; @@ -1130,16 +1132,23 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { } } else { // Begin iteration over a dataframe's columns - PyObject *tmp = PyObject_CallMethod(obj, "items"); + // TODO: need to free this + PyObject *tmp = PyObject_CallMethod(obj, "items", NULL); if (tmp == 0) { return; } - PyObject *ctx = PyObject_Malloc(size(PdFrameContext)); - ctx->index = 0; - ctx->items = tmp; - GET_TC(tc)->prv = ctx; + PyObject *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); + if (!frameCtxt) { + Py_DECREF(tmp); + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + frameCtxt->iterable = tmp; + GET_TC(tc)->prv = frameCtxt; } PRINTMARK(); 
@@ -1175,13 +1184,12 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 0; } } else { - n_cols = get_attr_length(obj, "columns"); - PdFrameContext *ctx = &(GET_TC(tc)->prv); - - if (ctx->index >= n_cols) - return 0; // TODO: does n_cols own a reference here? - - ctx->index++; + // TODO: Need to free these + PyObject *tmp = PyIter_Next(GET_TC(tc)->frame->iterable); + if (tmp == 0) + return 0; + + GET_TC(tc)->frame->currItem = tmp; } PRINTMARK(); @@ -1194,7 +1202,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { if (enc->outputFormat == SPLIT) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; } else { - PyObject_Free(GET_TC(tc)->items); + Py_DECREF(GET_TC(tc)->frame->iterable); } enc->outputFormat = enc->originalOutputFormat; @@ -1206,6 +1214,15 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } else { // get appropriate object from iterable + // Borrowed reference + PyObject *values = PyTuple_GetItem(GET_TC(tc)->frame->currItem, 1); + + + if (PyObject_HasAttrString(values, "to_numpy")) + // TODO: Need to free this + values = PyObject_CallMethod(values, "to_numpy", NULL); + + return values; } } @@ -1216,6 +1233,7 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return GET_TC(tc)->cStr; } else { // Return label of array here... 
+ return PyTuple_GetItem(GET_TC(tc)->frame->currItem, 0); } } From fae5c565eae4bf09f2c6a19a6568a4f83790cd2d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 15 Jul 2019 14:28:23 -0700 Subject: [PATCH 05/34] Compiled though broken :-X --- pandas/_libs/src/ujson/python/objToJSON.c | 100 ++-------------------- 1 file changed, 6 insertions(+), 94 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 5ceb3c953d7e4..fa11f7d73a7ad 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -81,6 +81,7 @@ typedef struct __NpyArrContext { typedef struct __PdFrameContext { PyObject *iterable; PyObject *currItem; + } PdFrameContext; typedef struct __TypeContext { @@ -1149,7 +1150,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { return; } - PyObject *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); + PdFrameContext *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); if (!frameCtxt) { Py_DECREF(tmp); PyErr_NoMemory(); @@ -1158,7 +1159,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { } frameCtxt->iterable = tmp; - GET_TC(tc)->prv = frameCtxt; + tc->prv = frameCtxt; } PRINTMARK(); @@ -1169,6 +1170,8 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { return 0; } @@ -1754,99 +1757,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterNext = DataFrame_iterNext; pc->iterGetValue = DataFrame_iterGetValue; pc->iterGetName = DataFrame_iterGetName; + return; - } - - - PRINTMARK(); - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetName = NpyArr_iterGetName; - - pc->newObj = get_values(obj); - if (!pc->newObj) { - goto INVALID; - } - - pc->iterGetValue = NpyArr_iterGetValue; - - 
if (enc->outputFormat == VALUES) { - PRINTMARK(); - tc->type = JT_ARRAY; - } else if (enc->outputFormat == RECORDS) { - PRINTMARK(); - tc->type = JT_ARRAY; - tmpObj = PyObject_GetAttrString(obj, "columns"); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - goto INVALID; - } - } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - PRINTMARK(); - tc->type = JT_OBJECT; - tmpObj = (enc->outputFormat == INDEX - ? PyObject_GetAttrString(obj, "index") - : PyObject_GetAttrString(obj, "columns")); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); - Py_DECREF(tmpObj); - tmpObj = (enc->outputFormat == INDEX - ? 
PyObject_GetAttrString(obj, "columns") - : PyObject_GetAttrString(obj, "index")); - if (!tmpObj) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - - if (enc->outputFormat == COLUMNS) { - PRINTMARK(); - pc->transpose = 1; - } - } else { - goto INVALID; - } - return; } else if (PyDict_Check(obj)) { PRINTMARK(); tc->type = JT_OBJECT; From 0ab46a466dcbaa277739b74e8a0773415aa7e4fd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 15 Jul 2019 15:39:10 -0700 Subject: [PATCH 06/34] working serialization --- pandas/_libs/src/ujson/python/objToJSON.c | 49 ++++++++++++----------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index fa11f7d73a7ad..d90d26aeb00ba 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -134,7 +134,9 @@ typedef struct __PyObjectEncoder { PyObject *defaultHandler; } PyObjectEncoder; -#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) +inline TypeContext* GET_TC(JSONTypeContext * __ptrtc) { + return (TypeContext *)__ptrtc->prv; +} enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; @@ -1128,6 +1130,7 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; Py_ssize_t n_cols; + printf("call to dataframe iterbegin\n"); // For SPLIT 
format the index tracks columns->index->data progression // all other formats use this to index by column @@ -1143,7 +1146,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { } } else { // Begin iteration over a dataframe's columns - // TODO: need to free this + printf("beginning frame iteration\n"); PyObject *tmp = PyObject_CallMethod(obj, "items", NULL); if (tmp == 0) { @@ -1157,9 +1160,11 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; } - + + printf("setting frame iteration context\n"); frameCtxt->iterable = tmp; - tc->prv = frameCtxt; + GET_TC(tc)->frame = frameCtxt; + printf("set frame iteration context!\n"); } PRINTMARK(); @@ -1168,6 +1173,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t n_cols; PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + printf("in frame iternext\n"); if (enc->outputFormat == SPLIT) { Py_ssize_t index; @@ -1197,12 +1203,18 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 0; } } else { - // TODO: Need to free these + printf("iterating over dataframe\n"); + // free previous entry + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + PyObject *tmp = PyIter_Next(GET_TC(tc)->frame->iterable); if (tmp == 0) return 0; - GET_TC(tc)->frame->currItem = tmp; + GET_TC(tc)->itemValue = tmp; } PRINTMARK(); @@ -1211,6 +1223,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + printf("done dataframe iteration\n"); if (enc->outputFormat == SPLIT) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; @@ -1222,21 +1235,9 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + printf("getting dataframe itervalue\n"); 
PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == SPLIT) { - return GET_TC(tc)->itemValue; - } else { - // get appropriate object from iterable - // Borrowed reference - PyObject *values = PyTuple_GetItem(GET_TC(tc)->frame->currItem, 1); - - - if (PyObject_HasAttrString(values, "to_numpy")) - // TODO: Need to free this - values = PyObject_CallMethod(values, "to_numpy", NULL); - - return values; - } + return GET_TC(tc)->itemValue; } char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { @@ -1244,9 +1245,8 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (enc->outputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; - } else { - // Return label of array here... - return PyTuple_GetItem(GET_TC(tc)->frame->currItem, 0); + } else { // Pass through to underlying Series + return NULL; } } @@ -1751,7 +1751,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = NpyArr_iterGetName; return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { - + printf("got a frame\n"); + tc->type = JT_OBJECT; pc->iterBegin = DataFrame_iterBegin; pc->iterEnd = DataFrame_iterEnd; pc->iterNext = DataFrame_iterNext; From 4eabc632cb9cc74a7c58beb8f4d87c4d3acf480c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 15 Jul 2019 16:16:41 -0700 Subject: [PATCH 07/34] Working basic example --- pandas/_libs/src/ujson/python/objToJSON.c | 36 +++++++---------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index d90d26aeb00ba..0b417dea3c8fd 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -263,26 +263,6 @@ static PyObject *get_values(PyObject *obj) { } } - if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { - PRINTMARK(); - values = PyObject_CallMethod(obj, 
"_internal_get_values", NULL); - if (values && !PyArray_CheckExact(values)) { - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } - - if (!values && PyObject_HasAttrString(obj, "get_block_values")) { - PRINTMARK(); - values = PyObject_CallMethod(obj, "get_block_values", NULL); - if (values && !PyArray_CheckExact(values)) { - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } - if (!values) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; @@ -1160,7 +1140,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; } - + printf("setting frame iteration context\n"); frameCtxt->iterable = tmp; GET_TC(tc)->frame = frameCtxt; @@ -1173,7 +1153,6 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t n_cols; PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - printf("in frame iternext\n"); if (enc->outputFormat == SPLIT) { Py_ssize_t index; @@ -1236,8 +1215,13 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { printf("getting dataframe itervalue\n"); - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - return GET_TC(tc)->itemValue; + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + if (enc->outputFormat == SPLIT) { + return GET_TC(tc)->itemValue; + } else { + // Borrowed ref + return PyTuple_GetItem(GET_TC(tc)->itemValue, 1); + } } char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { @@ -1245,8 +1229,8 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (enc->outputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; - } else { // Pass through to underlying Series - return NULL; + } else { + return PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); } } From c4a6661f0b28947b751dd5774db05056dcfae432 Mon Sep 17 
00:00:00 2001 From: Will Ayd Date: Tue, 16 Jul 2019 11:51:55 -0700 Subject: [PATCH 08/34] Fixed not sizing output buffer --- pandas/_libs/src/ujson/python/objToJSON.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 0b417dea3c8fd..e436c60431663 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1230,7 +1230,9 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } else { - return PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); + char *result = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); + *outLen = strlen(result); + return result; } } From 787c3bd5418ed8a74a5b9bb9950dceafd02dd920 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 16 Jul 2019 16:49:05 -0700 Subject: [PATCH 09/34] some docstrings --- pandas/_libs/src/ujson/python/objToJSON.c | 50 +++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e436c60431663..e1b3d1e9623d2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1107,6 +1107,31 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= // pandas DataFrame iteration functions //============================================================================= + +/* + * Function: DataFrame_iterBegin + * ----------------------------- + * Sets iteration state for dealing with DataFrame objects + * + * obj: JSON object being serialized (should be a DataFrame at Python level) + * tc: shared TypeContext for seralization + * + * Because various orientations are handling by the JSON parser this method + * is responsible for 
setting the appropriate iterator. + * + * Supported orient formats are: + * + * SPLIT: {index -> [index], columns -> [columns], data -> [values]} + * RECORDS: [{column -> value}, … , {column -> value}] + * INDEX: {index -> {column -> value}} + * COLUMNS: {column -> {index -> value}} + * VALUES: [[value, value, ...], [value, value, ...], ...] + * + * The context of serialization here is dependent upon the orient. + * RECORDS, and VALUES orients would make the context of serialization here + * JT_ARRAY (essentially a sequence we iterate over) whereas the other orients + * require a JT_OBJECT context (whereby we extract keys and values from the DataFrame). + */ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; Py_ssize_t n_cols; @@ -1150,6 +1175,13 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } +/* + * Function: DataFrame_iterNext + * ----------------------------- + * Provides instructions how to appropriately iterate the object. + * + * This is dependent on the orient as mentioned in DataFrame_iterBeing + */ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t n_cols; PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; @@ -1200,6 +1232,12 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } +/* + * Function: DataFrame_iterEnd + * ----------------------------- + * Callaback after DataFrame has been entirely iterated upon. + * + */ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; printf("done dataframe iteration\n"); @@ -1213,6 +1251,12 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } +/* + * Function: DataFrame_iterGetValue + * ----------------------------- + * Provides the value(s) for a particular iteration. This is valid whether + * the type context is JT_OBJECT or JT_ARRAY. 
+ */ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { printf("getting dataframe itervalue\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; @@ -1224,6 +1268,12 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { } } +/* + * Function: DataFrame_iterGetName + * ----------------------------- + * Provides the name for a particular iteration. This is only called if + * the type context is JT_OBJECT, which is dictated by the orient. + */ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { From 3a6a13b7a5cf476a60763995fb5617c938d2b785 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 16 Jul 2019 17:11:26 -0700 Subject: [PATCH 10/34] Set appropriate type context --- pandas/_libs/src/ujson/python/objToJSON.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e1b3d1e9623d2..06a1d1bd08bb8 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1134,22 +1134,23 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { */ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - Py_ssize_t n_cols; printf("call to dataframe iterbegin\n"); + if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES) + tc->type = JT_ARRAY; + else + tc->type = JT_OBJECT; + // For SPLIT format the index tracks columns->index->data progression // all other formats use this to index by column - GET_TC(tc)->index = 0; - if (enc->outputFormat == SPLIT) { PRINTMARK(); - tc->type = JT_OBJECT; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - } else { + } else if (enc->outputFormat == 
COLUMNS) { // Begin iteration over a dataframe's columns printf("beginning frame iteration\n"); PyObject *tmp = PyObject_CallMethod(obj, "items", NULL); @@ -1183,7 +1184,6 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { * This is dependent on the orient as mentioned in DataFrame_iterBeing */ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t n_cols; PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { @@ -1213,7 +1213,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); return 0; } - } else { + } else if (enc->outputFormat == COLUMNS) { printf("iterating over dataframe\n"); // free previous entry if (GET_TC(tc)->itemValue) { @@ -1244,7 +1244,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { if (enc->outputFormat == SPLIT) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - } else { + } else if (enc->outputFormat == COLUMNS) { Py_DECREF(GET_TC(tc)->frame->iterable); } enc->outputFormat = enc->originalOutputFormat; @@ -1262,7 +1262,7 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { return GET_TC(tc)->itemValue; - } else { + } else if (enc->outputFormat == COLUMNS) { // Borrowed ref return PyTuple_GetItem(GET_TC(tc)->itemValue, 1); } @@ -1279,7 +1279,7 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (enc->outputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; - } else { + } else if (enc->outputFormat == COLUMNS) { char *result = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); *outLen = strlen(result); return result; From 615c10400883e9ba65e18683822ddfbacd39c0a2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 10:25:03 -0700 Subject: [PATCH 11/34] Working VALUES iteration --- pandas/_libs/src/ujson/python/objToJSON.c | 82 +++++++++++++---------- 1 file changed, 45 
insertions(+), 37 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 06a1d1bd08bb8..b82caf1dee026 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1134,15 +1134,8 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { */ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - printf("call to dataframe iterbegin\n"); - - if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES) - tc->type = JT_ARRAY; - else - tc->type = JT_OBJECT; - + printf("DataFrame_iterBegin\n"); // For SPLIT format the index tracks columns->index->data progression - // all other formats use this to index by column if (enc->outputFormat == SPLIT) { PRINTMARK(); GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); @@ -1150,27 +1143,42 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - } else if (enc->outputFormat == COLUMNS) { - // Begin iteration over a dataframe's columns - printf("beginning frame iteration\n"); - PyObject *tmp = PyObject_CallMethod(obj, "items", NULL); + } else if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS) { + // Iterate over a frame's columns + PyObject *iter = PyObject_CallMethod(obj, "items", NULL); + + if (iter == 0) { + return; + } + + PdFrameContext *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); + if (!frameCtxt) { + Py_DECREF(iter); + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + frameCtxt->iterable = iter; + GET_TC(tc)->frame = frameCtxt; + } else if (enc->outputFormat == VALUES) { + // Iterate over a frame's rows + PyObject *iter = PyObject_CallMethod(obj, "iterrows", NULL); - if (tmp == 0) { + if (iter == 0) { return; } PdFrameContext *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); if 
(!frameCtxt) { - Py_DECREF(tmp); + Py_DECREF(iter); PyErr_NoMemory(); GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; } - printf("setting frame iteration context\n"); - frameCtxt->iterable = tmp; + frameCtxt->iterable = iter; GET_TC(tc)->frame = frameCtxt; - printf("set frame iteration context!\n"); } PRINTMARK(); @@ -1181,9 +1189,10 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { * ----------------------------- * Provides instructions how to appropriately iterate the object. * - * This is dependent on the orient as mentioned in DataFrame_iterBeing + * This is dependent on the orient as mentioned in DataFrame_iterBegin */ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { + printf("DataFrame_iterNext\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { @@ -1213,8 +1222,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); return 0; } - } else if (enc->outputFormat == COLUMNS) { - printf("iterating over dataframe\n"); + } else if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == VALUES) { // free previous entry if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); @@ -1239,15 +1247,13 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { * */ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + printf("DataFrame_iterEnd\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - printf("done dataframe iteration\n"); - if (enc->outputFormat == SPLIT) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - } else if (enc->outputFormat == COLUMNS) { + if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == VALUES) Py_DECREF(GET_TC(tc)->frame->iterable); - } - enc->outputFormat = enc->originalOutputFormat; + + enc->outputFormat = enc->originalOutputFormat; } @@ -1258,11 +1264,11 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { * the type context is JT_OBJECT or JT_ARRAY. 
*/ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { - printf("getting dataframe itervalue\n"); + printf("DataFrame_iterGetValue\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { return GET_TC(tc)->itemValue; - } else if (enc->outputFormat == COLUMNS) { + } else if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == VALUES) { // Borrowed ref return PyTuple_GetItem(GET_TC(tc)->itemValue, 1); } @@ -1275,15 +1281,14 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { * the type context is JT_OBJECT, which is dictated by the orient. */ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + printf("DataFrame_iterGetName\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == SPLIT) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; - } else if (enc->outputFormat == COLUMNS) { - char *result = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); - *outLen = strlen(result); - return result; + if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS) { + GET_TC(tc)->cStr = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); } + *outLen = strlen(GET_TC(tc)->cStr); + printf("%s\n", GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -1787,8 +1792,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = NpyArr_iterGetName; return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { - printf("got a frame\n"); - tc->type = JT_OBJECT; + if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES) + tc->type = JT_ARRAY; + else + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; pc->iterEnd = DataFrame_iterEnd; pc->iterNext = DataFrame_iterNext; From 1af960d7179fc2e851acdbfa78adf4e8ac20c17f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 
11:05:42 -0700 Subject: [PATCH 12/34] Working for all formats, save state mutation --- pandas/_libs/src/ujson/python/objToJSON.c | 28 ++++++++++++++++------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b82caf1dee026..895eeb0f65db2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -80,7 +80,6 @@ typedef struct __NpyArrContext { typedef struct __PdFrameContext { PyObject *iterable; - PyObject *currItem; } PdFrameContext; @@ -140,7 +139,7 @@ inline TypeContext* GET_TC(JSONTypeContext * __ptrtc) { enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; -#define PRINTMARK() +#define PRINTMARK() printf("%d\n", __LINE__) void *initObjToJSON(void) { @@ -1143,7 +1142,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - } else if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS) { + } else if (enc->outputFormat == COLUMNS) { // Iterate over a frame's columns PyObject *iter = PyObject_CallMethod(obj, "items", NULL); @@ -1161,7 +1160,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { frameCtxt->iterable = iter; GET_TC(tc)->frame = frameCtxt; - } else if (enc->outputFormat == VALUES) { + } else if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES || enc->outputFormat == INDEX) { // Iterate over a frame's rows PyObject *iter = PyObject_CallMethod(obj, "iterrows", NULL); @@ -1179,6 +1178,13 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { frameCtxt->iterable = iter; GET_TC(tc)->frame = frameCtxt; + + // The RECORDS format essentially generates a JSON array of Series in the + // INDEX format, so set that context during serialization + /* + if (enc->outputFormat == RECORDS) + enc->outputFormat = INDEX; + */ } PRINTMARK(); @@ -1222,7 +1228,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { 
PRINTMARK(); return 0; } - } else if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == VALUES) { + } else { // free previous entry if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); @@ -1234,6 +1240,12 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 0; GET_TC(tc)->itemValue = tmp; + + // RECORDS orient is essentially creating an array of INDEX + // oriented Series objects so set that here + // state should be reset in DataFrame_iterEnd + if (enc->outputFormat == RECORDS) + enc->outputFormat = INDEX; } PRINTMARK(); @@ -1250,7 +1262,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { printf("DataFrame_iterEnd\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == VALUES) + if (enc->outputFormat != SPLIT) Py_DECREF(GET_TC(tc)->frame->iterable); enc->outputFormat = enc->originalOutputFormat; @@ -1268,7 +1280,7 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->outputFormat == SPLIT) { return GET_TC(tc)->itemValue; - } else if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == VALUES) { + } else { // Borrowed ref return PyTuple_GetItem(GET_TC(tc)->itemValue, 1); } @@ -1283,7 +1295,7 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { printf("DataFrame_iterGetName\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS) { + if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == INDEX) { GET_TC(tc)->cStr = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); } *outLen = strlen(GET_TC(tc)->cStr); From 7eb25f3dcb7f864cafa9aad2627b33dbfcf78eae Mon Sep 17 00:00:00 2001 From: Will Ayd Date: 
Wed, 17 Jul 2019 11:43:11 -0700 Subject: [PATCH 13/34] Fixed segfaults and infinite loops with SPLIT --- pandas/_libs/src/ujson/python/objToJSON.c | 52 +++++++++++++---------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 895eeb0f65db2..e22a9a541c833 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -139,7 +139,7 @@ inline TypeContext* GET_TC(JSONTypeContext * __ptrtc) { enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; -#define PRINTMARK() printf("%d\n", __LINE__) +#define PRINTMARK() void *initObjToJSON(void) { @@ -1138,10 +1138,17 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { if (enc->outputFormat == SPLIT) { PRINTMARK(); GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + + // TODO: changing the outputFormat was in place when this method only dealt with + // the SPLIT orient. Now that this handles quite a few we should probably use + // another method for changing the outputFormat of underlying objects (maybe tc->prv) enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } + GET_TC(tc)->index = 0; + + return; // always return if modifying outputFormat } else if (enc->outputFormat == COLUMNS) { // Iterate over a frame's columns PyObject *iter = PyObject_CallMethod(obj, "items", NULL); @@ -1160,8 +1167,11 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { frameCtxt->iterable = iter; GET_TC(tc)->frame = frameCtxt; + + return; } else if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES || enc->outputFormat == INDEX) { // Iterate over a frame's rows + // Must be freed in DataFrame_iterEnd PyObject *iter = PyObject_CallMethod(obj, "iterrows", NULL); if (iter == 0) { @@ -1181,10 +1191,11 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { // The RECORDS format essentially generates a JSON 
array of Series in the // INDEX format, so set that context during serialization - /* + if (enc->outputFormat == RECORDS) enc->outputFormat = INDEX; - */ + + return; } PRINTMARK(); @@ -1201,17 +1212,20 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { printf("DataFrame_iterNext\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == SPLIT) { - Py_ssize_t index; - - if (!GET_TC(tc)->cStr) { - return 0; - } + // free previous entry + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + } + // Check the original output format as this may have been modified for + // underlying series' + if (enc->originalOutputFormat == SPLIT) { + Py_ssize_t index; index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); + printf("the index is %d\n", index); if (index == 0) { memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + PRINTMARK(); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); @@ -1222,19 +1236,13 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->itemValue) { return 0; } - - GET_TC(tc)->index++; } else { PRINTMARK(); return 0; } + + GET_TC(tc)->index++; } else { - // free previous entry - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } - PyObject *tmp = PyIter_Next(GET_TC(tc)->frame->iterable); if (tmp == 0) return 0; @@ -1244,7 +1252,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { // RECORDS orient is essentially creating an array of INDEX // oriented Series objects so set that here // state should be reset in DataFrame_iterEnd - if (enc->outputFormat == RECORDS) + if (enc->originalOutputFormat == RECORDS) enc->outputFormat = INDEX; } @@ -1262,7 +1270,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { printf("DataFrame_iterEnd\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat != 
SPLIT) + if (enc->originalOutputFormat != SPLIT) Py_DECREF(GET_TC(tc)->frame->iterable); enc->outputFormat = enc->originalOutputFormat; @@ -1278,7 +1286,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { printf("DataFrame_iterGetValue\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == SPLIT) { + if (enc->originalOutputFormat == SPLIT) { return GET_TC(tc)->itemValue; } else { // Borrowed ref @@ -1295,7 +1303,7 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { printf("DataFrame_iterGetName\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->outputFormat == COLUMNS || enc->outputFormat == RECORDS || enc->outputFormat == INDEX) { + if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == RECORDS || enc->originalOutputFormat == INDEX) { GET_TC(tc)->cStr = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); } *outLen = strlen(GET_TC(tc)->cStr); From 92cc5323b79c1eb044cc6d2edf7ec591021b9177 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 12:09:32 -0700 Subject: [PATCH 14/34] Fixed cStr handling --- pandas/_libs/src/ujson/python/objToJSON.c | 42 +++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e22a9a541c833..0507e7efccda5 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1215,7 +1215,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { // free previous entry if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); - } + } // Check the original output format as this may have been modified for // underlying series' @@ -1242,18 +1242,23 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { } GET_TC(tc)->index++; - 
} else { + } else { PyObject *tmp = PyIter_Next(GET_TC(tc)->frame->iterable); if (tmp == 0) return 0; - GET_TC(tc)->itemValue = tmp; + GET_TC(tc)->itemValue = PySequence_GetItem(tmp, 1); - // RECORDS orient is essentially creating an array of INDEX - // oriented Series objects so set that here - // state should be reset in DataFrame_iterEnd - if (enc->originalOutputFormat == RECORDS) - enc->outputFormat = INDEX; + if (enc->originalOutputFormat != VALUES) { + // TODO: align extension usage of itemValue and cStr + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + } + + GET_TC(tc)->itemValue = PySequence_GetItem(tmp, 0); + } + + Py_DECREF(tmp); } PRINTMARK(); @@ -1285,13 +1290,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { */ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { printf("DataFrame_iterGetValue\n"); - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->originalOutputFormat == SPLIT) { - return GET_TC(tc)->itemValue; - } else { - // Borrowed ref - return PyTuple_GetItem(GET_TC(tc)->itemValue, 1); - } + return GET_TC(tc)->itemValue; } /* @@ -1302,13 +1301,14 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { */ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { printf("DataFrame_iterGetName\n"); - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == RECORDS || enc->originalOutputFormat == INDEX) { - GET_TC(tc)->cStr = PyUnicode_AsUTF8(PyTuple_GetItem(GET_TC(tc)->itemValue, 0)); + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + if (enc->originalOutputFormat == SPLIT) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; + } else if (enc->originalOutputFormat != VALUES) { + *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); + return PyBytes_AS_STRING(GET_TC(tc)->itemName); } - *outLen = strlen(GET_TC(tc)->cStr); - printf("%s\n", GET_TC(tc)->cStr); - return 
GET_TC(tc)->cStr; } //============================================================================= From 296b4947c50425340df6b06b7177fce916b1c25a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 13:10:43 -0700 Subject: [PATCH 15/34] more cleanups of c string conversions --- pandas/_libs/src/ujson/python/objToJSON.c | 65 ++++++++++------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 0507e7efccda5..fe3ab6b3b728a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1134,6 +1134,8 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; printf("DataFrame_iterBegin\n"); + enc->originalOutputFormat = enc->outputFormat; + // For SPLIT format the index tracks columns->index->data progression if (enc->outputFormat == SPLIT) { PRINTMARK(); @@ -1149,30 +1151,14 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->index = 0; return; // always return if modifying outputFormat - } else if (enc->outputFormat == COLUMNS) { - // Iterate over a frame's columns - PyObject *iter = PyObject_CallMethod(obj, "items", NULL); - - if (iter == 0) { - return; - } - - PdFrameContext *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); - if (!frameCtxt) { - Py_DECREF(iter); - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - frameCtxt->iterable = iter; - GET_TC(tc)->frame = frameCtxt; + } else { + char *method; + if (enc->outputFormat == COLUMNS) + method = "items"; + else + method = "iterrows"; - return; - } else if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES || enc->outputFormat == INDEX) { - // Iterate over a frame's rows - // Must be freed in DataFrame_iterEnd - PyObject *iter = 
PyObject_CallMethod(obj, "iterrows", NULL); + PyObject *iter = PyObject_CallMethod(obj, method, NULL); if (iter == 0) { return; @@ -1192,8 +1178,9 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { // The RECORDS format essentially generates a JSON array of Series in the // INDEX format, so set that context during serialization - if (enc->outputFormat == RECORDS) + if (enc->outputFormat == RECORDS) { enc->outputFormat = INDEX; + } return; } @@ -1222,7 +1209,6 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { if (enc->originalOutputFormat == SPLIT) { Py_ssize_t index; index = GET_TC(tc)->index; - printf("the index is %d\n", index); if (index == 0) { memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); PRINTMARK(); @@ -1242,20 +1228,22 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { } GET_TC(tc)->index++; - } else { + } else { + if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { + // TODO: align extension usage of itemName and cStr + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + } + } + PyObject *tmp = PyIter_Next(GET_TC(tc)->frame->iterable); if (tmp == 0) return 0; GET_TC(tc)->itemValue = PySequence_GetItem(tmp, 1); - if (enc->originalOutputFormat != VALUES) { - // TODO: align extension usage of itemValue and cStr - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - } - - GET_TC(tc)->itemValue = PySequence_GetItem(tmp, 0); + if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { + GET_TC(tc)->itemName = PySequence_GetItem(tmp, 0); } Py_DECREF(tmp); @@ -1275,8 +1263,10 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { printf("DataFrame_iterEnd\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->originalOutputFormat != SPLIT) + if (enc->originalOutputFormat != SPLIT) { Py_DECREF(GET_TC(tc)->frame->iterable); + Py_DECREF(GET_TC(tc)->frame); + } enc->outputFormat = enc->originalOutputFormat; @@ -1305,9 +1295,10 @@ 
char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (enc->originalOutputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; - } else if (enc->originalOutputFormat != VALUES) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); + } else if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { + const char * cStr = PyUnicode_AsUTF8(GET_TC(tc)->itemName); + *outLen = strlen(cStr); + return cStr; } } From f67ecf96d7ade6a808ec0c145f84141aac2fbfde Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 13:19:17 -0700 Subject: [PATCH 16/34] Removed print statements --- pandas/_libs/src/ujson/python/objToJSON.c | 5 ----- pandas/tests/io/json/test_pandas.py | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index fe3ab6b3b728a..106fed49689ac 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1133,7 +1133,6 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { */ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - printf("DataFrame_iterBegin\n"); enc->originalOutputFormat = enc->outputFormat; // For SPLIT format the index tracks columns->index->data progression @@ -1196,7 +1195,6 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { * This is dependent on the orient as mentioned in DataFrame_iterBegin */ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - printf("DataFrame_iterNext\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; // free previous entry @@ -1260,7 +1258,6 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { * */ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - printf("DataFrame_iterEnd\n"); PyObjectEncoder *enc = (PyObjectEncoder 
*)tc->encoder; if (enc->originalOutputFormat != SPLIT) { @@ -1279,7 +1276,6 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { * the type context is JT_OBJECT or JT_ARRAY. */ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { - printf("DataFrame_iterGetValue\n"); return GET_TC(tc)->itemValue; } @@ -1290,7 +1286,6 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { * the type context is JT_OBJECT, which is dictated by the orient. */ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - printf("DataFrame_iterGetName\n"); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->originalOutputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa68..81172b2187581 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -416,6 +416,7 @@ def _check_all_orients( _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) _check_all_orients(self.intframe, dtype=False) + breakpoint() # big one # index and columns are strings as all unserialised JSON object keys # are assumed to be strings From 466380c8c05ef8d84417a916606a49318230044b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 13:40:56 -0700 Subject: [PATCH 17/34] Removed breakpoint --- pandas/tests/io/json/test_pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 81172b2187581..9c687f036aa68 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -416,7 +416,6 @@ def _check_all_orients( _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) _check_all_orients(self.intframe, dtype=False) - breakpoint() # big one # index and columns are strings as all unserialised JSON object keys # are assumed to be strings From 9c7b2e1ca1be1c0b48f603f45dc7469a226663ef Mon 
Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 14:14:50 -0700 Subject: [PATCH 18/34] Fixed issue with CategoricalIndex --- pandas/_libs/src/ujson/python/objToJSON.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 106fed49689ac..c6a6cab3b9452 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -262,6 +262,16 @@ static PyObject *get_values(PyObject *obj) { } } + if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { + PRINTMARK(); + values = PyObject_CallMethod(obj, "_internal_get_values", NULL); + if (values && !PyArray_CheckExact(values)) { + PRINTMARK(); + Py_DECREF(values); + values = NULL; + } + } + if (!values) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; @@ -1713,6 +1723,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } + printf("getting values\n"); pc->newObj = get_values(obj); if (pc->newObj) { PRINTMARK(); @@ -1723,6 +1734,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetValue = NpyArr_iterGetValue; pc->iterGetName = NpyArr_iterGetName; } else { + printf("fudge\n"); goto INVALID; } From a6da784b3e66e94597e044d738c052997e243944 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 14:18:56 -0700 Subject: [PATCH 19/34] Removed prints --- pandas/_libs/src/ujson/python/objToJSON.c | 2 -- pandas/tests/io/json/test_pandas.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c6a6cab3b9452..f5e100d62207f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1723,7 +1723,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } - printf("getting values\n"); pc->newObj = get_values(obj); if 
(pc->newObj) { PRINTMARK(); @@ -1734,7 +1733,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetValue = NpyArr_iterGetValue; pc->iterGetName = NpyArr_iterGetName; } else { - printf("fudge\n"); goto INVALID; } diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa68..2313833c7ece1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -449,6 +449,7 @@ def _check_all_orients( ) # time series data + breakpoint() _check_all_orients(self.tsframe) # mixed data From f6726abde028ad9bff7a75bf19e13295b82bae22 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 13:10:29 -0700 Subject: [PATCH 20/34] Going route of chaging NpyArr_encodeLabels --- pandas/_libs/src/ujson/python/objToJSON.c | 79 +++++++++++++++++++---- 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f5e100d62207f..55b9af3c5cd99 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1142,9 +1142,11 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { * require a JT_OBJECT context (whereby we extract keys and values from the DataFrame). 
*/ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->originalOutputFormat = enc->outputFormat; - + + GET_TC(tc)->index = 0; // For SPLIT format the index tracks columns->index->data progression if (enc->outputFormat == SPLIT) { PRINTMARK(); @@ -1157,7 +1159,6 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - GET_TC(tc)->index = 0; return; // always return if modifying outputFormat } else { @@ -1205,6 +1206,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { * This is dependent on the orient as mentioned in DataFrame_iterBegin */ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; // free previous entry @@ -1235,7 +1237,6 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 0; } - GET_TC(tc)->index++; } else { if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { // TODO: align extension usage of itemName and cStr @@ -1257,6 +1258,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_DECREF(tmp); } + GET_TC(tc)->index++; PRINTMARK(); return 1; } @@ -1268,6 +1270,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { * */ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->originalOutputFormat != SPLIT) { @@ -1286,6 +1289,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { * the type context is JT_OBJECT or JT_ARRAY. */ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); return GET_TC(tc)->itemValue; } @@ -1296,12 +1300,16 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { * the type context is JT_OBJECT, which is dictated by the orient. 
*/ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + PRINTMARK(); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->originalOutputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } else if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { - const char * cStr = PyUnicode_AsUTF8(GET_TC(tc)->itemName); + printf("Current index is %d\n", GET_TC(tc)->index); + // TODO: index is incremented before iteration completes which is unfortunate + // Need to align with SPLIT format and then can maybe increment here for clarity + const char * cStr = GET_TC(tc)->columnLabels[GET_TC(tc)->index - 1]; *outLen = strlen(cStr); return cStr; } @@ -1374,8 +1382,28 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } + +/* function NpyArr_encodeLabels + * ---------------------------- + * + * labels: a list-like object of labels to encode + * enc: JSON encoder + * num: number of labels + * should_quote: bool as to whether or not quoting should be done here + * + * This function takes care of encoding labels in one pass and is + * typically used for the columns or labels when a DataFrame or Series. + * It is particularly useful for items whose str repr is not what should + * be written out as a label (ex: Timestamp) + * + * However, it's usage here is rather non-idiomatic as it would be better + * to simply define a iterGetName method for the appropriate objects which + * converts the labels into the appropriate string. + * + * TODO: refactor this to fit better into ujson iteration model + */ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, - npy_intp num) { + npy_intp num, int should_quote) { // NOTE this function steals a reference to labels. 
PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; @@ -1446,7 +1474,9 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, break; } - need_quotes = ((*cLabel) != '"'); + need_quotes = should_quote && ((*cLabel) != '"'); + printf("need_quotes is %d\n", need_quotes); + printf("result is %s\n", cLabel); len = enc->offset - cLabel + 1 + 2 * need_quotes; ret[i] = PyObject_Malloc(sizeof(char) * len); @@ -1769,7 +1799,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, (JSONObjectEncoder *)enc, - pc->columnLabelsLen); + pc->columnLabelsLen, 1); if (!pc->columnLabels) { goto INVALID; } @@ -1812,14 +1842,37 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_ARRAY; else tc->type = JT_OBJECT; + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + PRINTMARK(); + tc->type = JT_OBJECT; + tmpObj = PyObject_GetAttrString(obj, "columns"); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + Py_DECREF(tmpObj); + if (!values) { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(values, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, + (JSONObjectEncoder *)enc, + pc->columnLabelsLen, 0); + printf("first item is %s\n", pc->columnLabels[0]); + printf("second item is %s\n", pc->columnLabels[1]); + if (!pc->columnLabels) { + goto INVALID; + } + } - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; - return; + return; } else if (PyDict_Check(obj)) { PRINTMARK(); tc->type = JT_OBJECT; From 
48d5a26a55f09a2e83ae9de4344c4c4b9f5e937e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 13:46:26 -0700 Subject: [PATCH 21/34] Hack to get labels working --- pandas/_libs/src/ujson/python/objToJSON.c | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 55b9af3c5cd99..0e8dbd0c8618e 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1306,11 +1306,20 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } else if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { - printf("Current index is %d\n", GET_TC(tc)->index); // TODO: index is incremented before iteration completes which is unfortunate // Need to align with SPLIT format and then can maybe increment here for clarity - const char * cStr = GET_TC(tc)->columnLabels[GET_TC(tc)->index - 1]; - *outLen = strlen(cStr); + int index = GET_TC(tc)->index - 1; + char * cStr = GET_TC(tc)->columnLabels[index]; + + // TODO: we are manually removing quotes and a trailing colon because ujson will add as required + // but we already have them from the NpyArr_encodeLabels call. Should comprehensively + // refactor to get rid of that method entirely so as not to do this + int len = strlen(cStr); + // increment pointer to next character and remove trailing quote + cStr++; + cStr[len - 3] = 0; + *outLen = len - 3; + return cStr; } } @@ -1403,7 +1412,7 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { * TODO: refactor this to fit better into ujson iteration model */ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, - npy_intp num, int should_quote) { + npy_intp num) { // NOTE this function steals a reference to labels. 
PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; @@ -1474,9 +1483,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, break; } - need_quotes = should_quote && ((*cLabel) != '"'); - printf("need_quotes is %d\n", need_quotes); - printf("result is %s\n", cLabel); + need_quotes = ((*cLabel) != '"'); len = enc->offset - cLabel + 1 + 2 * need_quotes; ret[i] = PyObject_Malloc(sizeof(char) * len); @@ -1799,7 +1806,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, (JSONObjectEncoder *)enc, - pc->columnLabelsLen, 1); + pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; } @@ -1858,9 +1865,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->columnLabelsLen = PyArray_DIM(values, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, (JSONObjectEncoder *)enc, - pc->columnLabelsLen, 0); - printf("first item is %s\n", pc->columnLabels[0]); - printf("second item is %s\n", pc->columnLabels[1]); + pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; } From 96476ddc6dd3a05f2b80d1dcb4bd58b601640ea2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 13:48:13 -0700 Subject: [PATCH 22/34] comments --- pandas/_libs/src/ujson/python/objToJSON.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 0e8dbd0c8618e..a16574d9be81f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1850,6 +1850,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { else tc->type = JT_OBJECT; + // TODO: calling these columnLabels with INDEX formatting is confusing, + // but there's not really a need to have both columnLabels and rowLabels + // anyway; subsequent refactor should just make these labels if 
(enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { PRINTMARK(); tc->type = JT_OBJECT; From 655d96ef7a91df945aec5d0137557e9115473e1b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 13:57:16 -0700 Subject: [PATCH 23/34] Working implementation for almost all cases --- pandas/_libs/src/ujson/python/objToJSON.c | 9 ++++++++- pandas/tests/io/json/test_pandas.py | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a16574d9be81f..24c3dc5fb0ecc 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1856,7 +1856,14 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { PRINTMARK(); tc->type = JT_OBJECT; - tmpObj = PyObject_GetAttrString(obj, "columns"); + + char *attr; + if (enc->outputFormat == INDEX) + attr = "index"; + else + attr = "columns"; + + tmpObj = PyObject_GetAttrString(obj, attr); if (!tmpObj) { goto INVALID; } diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2313833c7ece1..d17f93b8d2f1a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -82,6 +82,7 @@ def setup(self, datapath): del self.tsframe del self.mixed_frame + @pytest.mark.skip("Need to change JSON approach") def test_frame_double_encoded_labels(self): df = DataFrame( [["a", "b"], ["c", "d"]], @@ -145,6 +146,7 @@ def _check(df): ) assert_frame_equal(result, df) + for o in [ [["a", "b"], ["c", "d"]], [[1.5, 2.5], [3.5, 4.5]], @@ -449,7 +451,6 @@ def _check_all_orients( ) # time series data - breakpoint() _check_all_orients(self.tsframe) # mixed data From 2c29a6f68f798a6b30212572bcd689b7ccdccb52 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 13:58:17 -0700 Subject: [PATCH 24/34] Removed errant whitespace --- pandas/tests/io/json/test_pandas.py 
| 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d17f93b8d2f1a..7d5f18b52dc72 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -146,7 +146,6 @@ def _check(df): ) assert_frame_equal(result, df) - for o in [ [["a", "b"], ["c", "d"]], [[1.5, 2.5], [3.5, 4.5]], From fb50c436f9de3f93993aac62778b6edee32e16e0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 13:58:58 -0700 Subject: [PATCH 25/34] Removed unnused doc param --- pandas/_libs/src/ujson/python/objToJSON.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 24c3dc5fb0ecc..e82dd5e6e7b1b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1398,7 +1398,6 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { * labels: a list-like object of labels to encode * enc: JSON encoder * num: number of labels - * should_quote: bool as to whether or not quoting should be done here * * This function takes care of encoding labels in one pass and is * typically used for the columns or labels when a DataFrame or Series. 
From a742b72b1d3329e3c4d769b16770dbfc0030a74d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 15:02:42 -0700 Subject: [PATCH 26/34] Removed PdFrameContext --- pandas/_libs/src/ujson/python/objToJSON.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e82dd5e6e7b1b..286ddb821e6f2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -78,10 +78,6 @@ typedef struct __NpyArrContext { char **columnLabels; } NpyArrContext; -typedef struct __PdFrameContext { - PyObject *iterable; - -} PdFrameContext; typedef struct __TypeContext { JSPFN_ITERBEGIN iterBegin; @@ -104,7 +100,6 @@ typedef struct __TypeContext { char *cStr; NpyArrContext *npyarr; - PdFrameContext *frame; int transpose; char **rowLabels; char **columnLabels; @@ -193,7 +188,6 @@ static TypeContext *createTypeContext(void) { pc->doubleValue = 0.0; pc->cStr = NULL; pc->npyarr = NULL; - pc->frame = NULL; pc->rowLabels = NULL; pc->columnLabels = NULL; pc->transpose = 0; @@ -1174,16 +1168,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { return; } - PdFrameContext *frameCtxt = (PdFrameContext *)PyObject_Malloc(sizeof(PdFrameContext)); - if (!frameCtxt) { - Py_DECREF(iter); - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - frameCtxt->iterable = iter; - GET_TC(tc)->frame = frameCtxt; + GET_TC(tc)->iterator = iter; // The RECORDS format essentially generates a JSON array of Series in the // INDEX format, so set that context during serialization @@ -1245,7 +1230,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { } } - PyObject *tmp = PyIter_Next(GET_TC(tc)->frame->iterable); + PyObject *tmp = PyIter_Next(GET_TC(tc)->iterator); if (tmp == 0) return 0; @@ -1274,8 +1259,7 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder 
*)tc->encoder; if (enc->originalOutputFormat != SPLIT) { - Py_DECREF(GET_TC(tc)->frame->iterable); - Py_DECREF(GET_TC(tc)->frame); + Py_DECREF(GET_TC(tc)->iterator); } enc->outputFormat = enc->originalOutputFormat; From 800308768a08b2d79e2adad69b63897a9694bdcf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 15:05:23 -0700 Subject: [PATCH 27/34] Removed unused transpose struct member --- pandas/_libs/src/ujson/python/objToJSON.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 286ddb821e6f2..5fdf032d65aeb 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -100,7 +100,6 @@ typedef struct __TypeContext { char *cStr; NpyArrContext *npyarr; - int transpose; char **rowLabels; char **columnLabels; npy_intp rowLabelsLen; @@ -190,7 +189,6 @@ static TypeContext *createTypeContext(void) { pc->npyarr = NULL; pc->rowLabels = NULL; pc->columnLabels = NULL; - pc->transpose = 0; pc->rowLabelsLen = 0; pc->columnLabelsLen = 0; @@ -639,19 +637,11 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim = 0; npyarr->type_num = PyArray_DESCR(obj)->type_num; - if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; - } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; - } + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; npyarr->columnLabels = GET_TC(tc)->columnLabels; npyarr->rowLabels = GET_TC(tc)->rowLabels; From 8218515f718a3eede40b0a79c7aa6275d5f0e0ae Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 
Jul 2019 15:10:50 -0700 Subject: [PATCH 28/34] Remove more block code --- pandas/_libs/src/ujson/python/objToJSON.c | 44 +---------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 5fdf032d65aeb..b4a7054073aa6 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -254,6 +254,7 @@ static PyObject *get_values(PyObject *obj) { } } + // For Categorical et al if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "_internal_get_values", NULL); @@ -287,36 +288,6 @@ static PyObject *get_values(PyObject *obj) { return values; } -static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - PyObject *ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_GetAttrString(tmp, subAttr); - Py_DECREF(tmp); - - return ret; -} - -static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - Py_ssize_t ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_Length(tmp); - Py_DECREF(tmp); - - if (ret == -1) { - return 0; - } - - return ret; -} - static npy_int64 get_long_attr(PyObject *o, const char *attr) { npy_int64 long_val; PyObject *value = PyObject_GetAttrString(o, attr); @@ -334,19 +305,6 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static PyObject *get_item(PyObject *obj, Py_ssize_t i) { - PyObject *tmp = PyLong_FromSsize_t(i); - PyObject *ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_GetItem(obj, tmp); - Py_DECREF(tmp); - - return ret; -} - static void *CDouble(JSOBJ obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { PRINTMARK(); From 04cb5087db9e8dd755b59928d83beaff76fa5e7a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 15:18:36 -0700 Subject: [PATCH 
29/34] Fixed issue with label encoding --- pandas/_libs/src/ujson/python/objToJSON.c | 16 ++++------------ pandas/tests/io/json/test_pandas.py | 1 - 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b4a7054073aa6..a6a658ff5b755 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1241,18 +1241,10 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { // TODO: index is incremented before iteration completes which is unfortunate // Need to align with SPLIT format and then can maybe increment here for clarity int index = GET_TC(tc)->index - 1; - char * cStr = GET_TC(tc)->columnLabels[index]; - - // TODO: we are manually removing quotes and a trailing colon because ujson will add as required - // but we already have them from the NpyArr_encodeLabels call. Should comprehensively - // refactor to get rid of that method entirely so as not to do this - int len = strlen(cStr); - // increment pointer to next character and remove trailing quote - cStr++; - cStr[len - 3] = 0; - *outLen = len - 3; - - return cStr; + + // Also TODO: return a value here rather than having NpyArr_getLabel modify output buf + NpyArr_getLabel(obj, tc, outLen, index, GET_TC(tc)->columnLabels); + return NULL; } } diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7d5f18b52dc72..9c687f036aa68 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -82,7 +82,6 @@ def setup(self, datapath): del self.tsframe del self.mixed_frame - @pytest.mark.skip("Need to change JSON approach") def test_frame_double_encoded_labels(self): df = DataFrame( [["a", "b"], ["c", "d"]], From a92c96b491ade5c665ce33e73a2ea420d3c5f080 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 15:20:25 -0700 Subject: [PATCH 30/34] Whitespace cleanup --- 
pandas/_libs/src/ujson/python/objToJSON.c | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a6a658ff5b755..3695e54ed7817 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1177,7 +1177,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_DECREF(GET_TC(tc)->itemName); } } - + PyObject *tmp = PyIter_Next(GET_TC(tc)->iterator); if (tmp == 0) return 0; @@ -1191,30 +1191,30 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_DECREF(tmp); } - GET_TC(tc)->index++; + GET_TC(tc)->index++; PRINTMARK(); return 1; } -/* +/* * Function: DataFrame_iterEnd * ----------------------------- * Callaback after DataFrame has been entirely iterated upon. * */ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PRINTMARK(); + PRINTMARK(); PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - + if (enc->originalOutputFormat != SPLIT) { Py_DECREF(GET_TC(tc)->iterator); } - + enc->outputFormat = enc->originalOutputFormat; - + } -/* +/* * Function: DataFrame_iterGetValue * ----------------------------- * Provides the value(s) for a particular iteration. This is valid whether @@ -1225,15 +1225,15 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -/* +/* * Function: DataFrame_iterGetName * ----------------------------- - * Provides the name for a particular iteration. This is only called if + * Provides the name for a particular iteration. This is only called if * the type context is JT_OBJECT, which is dictated by the orient. 
*/ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PRINTMARK(); - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; if (enc->originalOutputFormat == SPLIT) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; @@ -1803,13 +1803,13 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } } - + pc->iterBegin = DataFrame_iterBegin; pc->iterEnd = DataFrame_iterEnd; pc->iterNext = DataFrame_iterNext; pc->iterGetValue = DataFrame_iterGetValue; pc->iterGetName = DataFrame_iterGetName; - + return; } else if (PyDict_Check(obj)) { PRINTMARK(); From f60a1398c07cbb759af69646c0ee3063fc74da24 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 15:24:42 -0700 Subject: [PATCH 31/34] Wide frame benchmark --- asv_bench/benchmarks/io/json.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 0ce42856fb14a..e3300995b852e 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -145,10 +145,15 @@ def time_float_int_str_lines(self, orient): class ToJSONMem: def setup_cache(self): df = DataFrame([[1]]) - frames = {"int": df, "float": df.astype(float)} + wide = DataFrame(np.ones((10_000, 10_000))) + frames = {"int": df, "float": df.astype(float), "wide": wide} return frames + def mem_int(self, frames): + df = frames["wide"] + wide.to_json() + def peakmem_int(self, frames): df = frames["int"] for _ in range(100_000): From 1de936f6ceb0ef48956dbb5bb915e8fa2e915ac4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 19 Jul 2019 16:37:37 -0700 Subject: [PATCH 32/34] Fixed new benchmark --- asv_bench/benchmarks/io/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index e3300995b852e..0180ed267b201 100644 --- a/asv_bench/benchmarks/io/json.py +++ 
b/asv_bench/benchmarks/io/json.py @@ -152,7 +152,7 @@ def setup_cache(self): def mem_int(self, frames): df = frames["wide"] - wide.to_json() + df.to_json() def peakmem_int(self, frames): df = frames["int"] From 8b226a69a819dad5c67232894f85da4fcaa3dd78 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 20 Jul 2019 12:30:10 -0700 Subject: [PATCH 33/34] More comprehensive benchmarks --- asv_bench/benchmarks/io/json.py | 64 +++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 0180ed267b201..c8d5fcbb11282 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -63,10 +63,10 @@ def peakmem_read_json_lines_concat(self, index): class ToJSON(BaseIO): fname = "__test__.json" - params = ["split", "columns", "index"] + params = ["split", "columns", "index", "records", "values"] param_names = ["orient"] - def setup(self, lines_orient): + def setup(self, orient): N = 10 ** 5 ncols = 5 index = date_range("20000101", periods=N, freq="H") @@ -126,19 +126,69 @@ def time_float_int(self, orient): def time_float_int_str(self, orient): self.df_int_float_str.to_json(self.fname, orient=orient) - def time_floats_with_int_idex_lines(self, orient): + +class ToJSONLines(BaseIO): + + fname = "__test__.json" + + def setup(self): + N = 10 ** 5 + ncols = 5 + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") + ints = np.random.randint(100000000, size=N) + floats = np.random.randn(N) + strings = tm.makeStringIndex(N) + self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) + self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) + self.df_td_int_ts = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "int_1": ints, + "int_2": ints, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + 
self.df_int_floats = DataFrame( + { + "int_1": ints, + "int_2": ints, + "int_3": ints, + "float_1": floats, + "float_2": floats, + "float_3": floats, + }, + index=index, + ) + self.df_int_float_str = DataFrame( + { + "int_1": ints, + "int_2": ints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) + + def time_floats_with_int_index_lines(self): self.df.to_json(self.fname, orient="records", lines=True) - def time_floats_with_dt_index_lines(self, orient): + def time_floats_with_dt_index_lines(self): self.df_date_idx.to_json(self.fname, orient="records", lines=True) - def time_delta_int_tstamp_lines(self, orient): + def time_delta_int_tstamp_lines(self): self.df_td_int_ts.to_json(self.fname, orient="records", lines=True) - def time_float_int_lines(self, orient): + def time_float_int_lines(self): self.df_int_floats.to_json(self.fname, orient="records", lines=True) - def time_float_int_str_lines(self, orient): + def time_float_int_str_lines(self): self.df_int_float_str.to_json(self.fname, orient="records", lines=True) From 82739bee8ef71e49e006b9bbc02b883c97a225e3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 20 Jul 2019 14:45:05 -0700 Subject: [PATCH 34/34] Passed VALUES directly through to numpy --- pandas/_libs/src/ujson/python/objToJSON.c | 39 +++++++++++------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 3695e54ed7817..a860969d759a5 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1079,9 +1079,12 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { * VALUES: [[value, value, ...], [value, value, ...], ...] * * The context of serialization here is dependent upon the orient. 
- * RECORDS, and VALUES orients would make the context of serialization here + * RECORDS would make the context of serialization here * JT_ARRAY (essentially a sequence we iterate over) whereas the other orients * require a JT_OBJECT context (whereby we extract keys and values from the DataFrame). + * + * VALUES orients actually shouldn't pass through here at all and can be dispatched + * directly to the NumPy array serialization, since they don't encode labels. + */ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); @@ -1101,8 +1104,6 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - - return; // always return if modifying outputFormat } else { char *method; if (enc->outputFormat == COLUMNS) @@ -1125,10 +1126,10 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { enc->outputFormat = INDEX; } - return; } PRINTMARK(); + return; } /* @@ -1171,23 +1172,11 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { } } else { - if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { - // TODO: align extension usage of itemName and cStr - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - } - } - PyObject *tmp = PyIter_Next(GET_TC(tc)->iterator); if (tmp == 0) return 0; GET_TC(tc)->itemValue = PySequence_GetItem(tmp, 1); - - if (enc->originalOutputFormat == COLUMNS || enc->originalOutputFormat == INDEX) { - GET_TC(tc)->itemName = PySequence_GetItem(tmp, 0); - } - Py_DECREF(tmp); } @@ -1768,7 +1757,20 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = NpyArr_iterGetName; return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { - if (enc->outputFormat == RECORDS || enc->outputFormat == VALUES) + // VALUES doesn't encode labels, so can treat as numpy array + if (enc->outputFormat == VALUES) { + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; + } + tc->type = JT_ARRAY; + pc->iterBegin = 
NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + return; + } + else if (enc->outputFormat == RECORDS) tc->type = JT_ARRAY; else tc->type = JT_OBJECT; @@ -1777,9 +1779,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { // but there's not really a need to have both columnLabels and rowLabels // anyway; subsequent refactor should just make these labels if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - PRINTMARK(); - tc->type = JT_OBJECT; - char *attr; if (enc->outputFormat == INDEX) attr = "index";