From e31f8397519281ebc356e03c78394cca9e7406bb Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 11 May 2013 18:34:32 -0700
Subject: [PATCH 01/10] ENH: pull pandasjson back into pandas

---
 pandas/core/frame.py                     |  100 ++
 pandas/core/series.py                    |   82 ++
 pandas/io/tests/test_json/test_pandas.py |  240 +++++
 pandas/io/tests/test_json/test_ujson.py  | 1230 ++++++++++++++++++++++
 pandas/src/ujson/lib/ultrajson.h         |  298 ++++++
 pandas/src/ujson/python/py_defines.h     |   15 +
 pandas/src/ujson/python/version.h        |    1 +
 setup.py                                 |   21 +
 8 files changed, 1987 insertions(+)
 create mode 100644 pandas/io/tests/test_json/test_pandas.py
 create mode 100644 pandas/io/tests/test_json/test_ujson.py
 create mode 100644 pandas/src/ujson/lib/ultrajson.h
 create mode 100644 pandas/src/ujson/python/py_defines.h
 create mode 100644 pandas/src/ujson/python/version.h

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9c0a2843370f4..2925bb3e3b73a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5593,6 +5593,106 @@ def mask(self, cond):
         """
         return self.where(~cond, NA)
 
+
+@classmethod
+def from_json(cls, json, orient="columns", dtype=None, numpy=True):
+    """
+    Convert JSON string to DataFrame
+
+    Parameters
+    ----------
+    json : The JSON string to parse.
+    orient : {'split', 'records', 'index', 'columns', 'values'},
+             default 'columns'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], columns -> [columns], data -> [values]}
+        records : list like [{column -> value}, ... , {column -> value}]
+        index : dict like {index -> {column -> value}}
+        columns : dict like {column -> {index -> value}}
+        values : just the values array
+    dtype : dtype of the resulting DataFrame
+    numpy : direct decoding to numpy arrays. default True but falls back
+        to standard decoding if a problem occurs.
+
+    Returns
+    -------
+    result : DataFrame
+    """
+    from pandas.json import loads
+
+    df = None
+
+    if dtype is not None and orient == "split":
+        numpy = False
+
+    if numpy:
+        try:
+            if orient == "columns":
+                args = loads(json, dtype=dtype, numpy=True, labelled=True)
+                if args:
+                    args = (args[0].T, args[2], args[1])
+                df = DataFrame(*args)
+            elif orient == "split":
+                decoded = loads(json, dtype=dtype, numpy=True)
+                decoded = dict((str(k), v) for k, v in decoded.iteritems())
+                df = DataFrame(**decoded)
+            elif orient == "values":
+                df = DataFrame(loads(json, dtype=dtype, numpy=True))
+            else:
+                df = DataFrame(*loads(json, dtype=dtype, numpy=True,
+                                      labelled=True))
+        except ValueError:
+            numpy = False
+    if not numpy:
+        if orient == "columns":
+            df = DataFrame(loads(json), dtype=dtype)
+        elif orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            df = DataFrame(dtype=dtype, **decoded)
+        elif orient == "index":
+            df = DataFrame(loads(json), dtype=dtype).T
+        else:
+            df = DataFrame(loads(json), dtype=dtype)
+
+    return df
+DataFrame.from_json = from_json
+
+
+def to_json(self, orient="columns", double_precision=10,
+            force_ascii=True):
+    """
+    Convert DataFrame to a JSON string.
+
+    Note NaN's and None will be converted to null and datetime objects
+    will be converted to UNIX timestamps.
+
+    Parameters
+    ----------
+    orient : {'split', 'records', 'index', 'columns', 'values'},
+             default 'columns'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], columns -> [columns], data -> [values]}
+        records : list like [{column -> value}, ... , {column -> value}]
+        index : dict like {index -> {column -> value}}
+        columns : dict like {column -> {index -> value}}
+        values : just the values array
+    double_precision : The number of decimal places to use when encoding
+        floating point values, default 10.
+    force_ascii : force encoded string to be ASCII, default True.
+
+    Returns
+    -------
+    result : JSON compatible string
+    """
+    from pandas.json import dumps
+    return dumps(self, orient=orient, double_precision=double_precision,
+                 ensure_ascii=force_ascii)
+DataFrame.to_json = to_json
+
+
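The two methods added above round-trip as follows — a condensed, illustrative sketch of what test_pandas.py later in this patch exercises, assuming a build with the new pandas.json extension (the sort() calls mirror the tests, since JSON object keys carry no ordering):

    from pandas import DataFrame, Series

    df = DataFrame([[1, 2], [3, 4]], index=['a', 'b'], columns=['x', 'y'])

    # 'columns' (the default) keeps both axes; 'records' drops the index;
    # 'values' keeps only the data array
    unser = DataFrame.from_json(df.to_json(orient='columns'))
    assert (df.sort() == unser.sort()).values.all()

    # Series (next hunk) defaults to orient='index'; JSON object keys always
    # come back as strings, so string labels round-trip most cleanly
    s = Series([10.0, 20.0, 30.0], index=['a', 'b', 'c'])
    assert Series.from_json(s.to_json())['b'] == 20.0
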
 _EMPTY_SERIES = Series([])

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 3a7a7d0f49b66..9147e64f5b11a 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -3298,6 +3298,88 @@ def str(self):
         from pandas.core.strings import StringMethods
         return StringMethods(self)
 
+
+@classmethod
+def from_json(cls, json, orient="index", dtype=None, numpy=True):
+    """
+    Convert JSON string to Series
+
+    Parameters
+    ----------
+    json : The JSON string to parse.
+    orient : {'split', 'records', 'index'}, default 'index'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], name -> name, data -> [values]}
+        records : list like [value, ... , value]
+        index : dict like {index -> value}
+    dtype : dtype of the resulting Series
+    numpy : direct decoding to numpy arrays. default True but falls back
+        to standard decoding if a problem occurs.
+
+    Returns
+    -------
+    result : Series
+    """
+    from pandas.json import loads
+    s = None
+
+    if dtype is not None and orient == "split":
+        numpy = False
+
+    if numpy:
+        try:
+            if orient == "split":
+                decoded = loads(json, dtype=dtype, numpy=True)
+                decoded = dict((str(k), v) for k, v in decoded.iteritems())
+                s = Series(**decoded)
+            elif orient == "columns" or orient == "index":
+                s = Series(*loads(json, dtype=dtype, numpy=True,
+                                  labelled=True))
+            else:
+                s = Series(loads(json, dtype=dtype, numpy=True))
+        except ValueError:
+            numpy = False
+    if not numpy:
+        if orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            s = Series(dtype=dtype, **decoded)
+        else:
+            s = Series(loads(json), dtype=dtype)
+
+    return s
+Series.from_json = from_json
+
+def to_json(self, orient="index", double_precision=10, force_ascii=True):
+    """
+    Convert Series to a JSON string
+
+    Note NaN's and None will be converted to null and datetime objects
+    will be converted to UNIX timestamps.
+
+    Parameters
+    ----------
+    orient : {'split', 'records', 'index'}, default 'index'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], name -> name, data -> [values]}
+        records : list like [value, ... , value]
+        index : dict like {index -> value}
+    double_precision : The number of decimal places to use when encoding
+        floating point values, default 10.
+    force_ascii : force encoded string to be ASCII, default True.
+ + Returns + ------- + result : JSON compatible string + """ + from pandas.json import dumps + return dumps(self, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) +Series.to_json = to_json + + _INDEX_TYPES = ndarray, Index, list, tuple #------------------------------------------------------------------------------ diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py new file mode 100644 index 0000000000000..506aa382487d6 --- /dev/null +++ b/pandas/io/tests/test_json/test_pandas.py @@ -0,0 +1,240 @@ +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta +from StringIO import StringIO +import cPickle as pickle +import operator +import os +import unittest + +import numpy as np + +from pandas import Series, DataFrame, DatetimeIndex +import pandas as pd + +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal) +import pandas.util.testing as tm + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() + +_frame = DataFrame(_seriesd) +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_intframe = DataFrame(dict((k, v.astype(int)) + for k, v in _seriesd.iteritems())) + +_tsframe = DataFrame(_tsd) + +_mixed_frame = _frame.copy() + + +class TestPandasObjects(unittest.TestCase): + + def setUp(self): + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.objSeries = tm.makeObjectSeries() + self.objSeries.name = 'objects' + + self.empty_series = Series([], index=[]) + self.empty_frame = DataFrame({}) + + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.intframe = _intframe.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def test_frame_from_json_to_json(self): + + def _check_orient(df, orient, dtype=None, numpy=True): + df = df.sort() + dfjson = df.to_json(orient=orient) + unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy) + unser = unser.sort() + if df.index.dtype.type == np.datetime64: + unser.index = DatetimeIndex(unser.index.values.astype('i8')) + if orient == "records": + # index is not captured in this orientation + assert_almost_equal(df.values, unser.values) + self.assert_(df.columns.equals(unser.columns)) + elif orient == "values": + # index and cols are not captured in this orientation + assert_almost_equal(df.values, unser.values) + elif orient == "split": + # index and col labels might not be strings + unser.index = [str(i) for i in unser.index] + unser.columns = [str(i) for i in unser.columns] + unser = unser.sort() + assert_almost_equal(df.values, unser.values) + else: + assert_frame_equal(df, unser) + + def _check_all_orients(df, dtype=None): + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, numpy=False) + _check_orient(df, "records", dtype=dtype, numpy=False) + _check_orient(df, "split", dtype=dtype, numpy=False) + _check_orient(df, "index", dtype=dtype, numpy=False) + _check_orient(df, "values", dtype=dtype, numpy=False) + + # basic + _check_all_orients(self.frame) + self.assertEqual(self.frame.to_json(), + self.frame.to_json(orient="columns")) + + _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + + # big one + # index and 
columns are strings as all unserialised JSON object keys + # are assumed to be strings + biggie = DataFrame(np.zeros((200, 4)), + columns=[str(i) for i in range(4)], + index=[str(i) for i in range(200)]) + _check_all_orients(biggie) + + # dtypes + _check_all_orients(DataFrame(biggie, dtype=np.float64), + dtype=np.float64) + _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) + _check_all_orients(DataFrame(biggie, dtype='= 3 + else partial(json.dumps, encoding="utf-8")) + +class UltraJSONTests(TestCase): + def test_encodeDictWithUnicodeKeys(self): + input = { u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1" } + output = ujson.encode(input) + + input = { u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1" } + output = ujson.encode(input) + + pass + + def test_encodeDoubleConversion(self): + input = math.pi + output = ujson.encode(input) + self.assertEquals(round(input, 5), round(json.loads(output), 5)) + self.assertEquals(round(input, 5), round(ujson.decode(output), 5)) + + def test_encodeWithDecimal(self): + input = 1.0 + output = ujson.encode(input) + self.assertEquals(output, "1.0") + + def test_encodeDoubleNegConversion(self): + input = -math.pi + output = ujson.encode(input) + self.assertEquals(round(input, 5), round(json.loads(output), 5)) + self.assertEquals(round(input, 5), round(ujson.decode(output), 5)) + + def test_encodeArrayOfNestedArrays(self): + input = [[[[]]]] * 20 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + #self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + input = np.array(input) + assert_array_equal(input, ujson.decode(output, numpy=True, dtype=input.dtype)) + + def test_encodeArrayOfDoubles(self): + input = [ 31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + #self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + + def test_doublePrecisionTest(self): + input = 30.012345678901234 + output = ujson.encode(input, double_precision = 15) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + + output = ujson.encode(input, double_precision = 9) + self.assertEquals(round(input, 9), json.loads(output)) + self.assertEquals(round(input, 9), ujson.decode(output)) + + output = ujson.encode(input, double_precision = 3) + self.assertEquals(round(input, 3), json.loads(output)) + self.assertEquals(round(input, 3), ujson.decode(output)) + + output = ujson.encode(input) + self.assertEquals(round(input, 5), json.loads(output)) + self.assertEquals(round(input, 5), ujson.decode(output)) + + def test_invalidDoublePrecision(self): + input = 30.12345678901234567890 + output = ujson.encode(input, double_precision = 20) + # should snap to the max, which is 15 + self.assertEquals(round(input, 15), json.loads(output)) + self.assertEquals(round(input, 15), ujson.decode(output)) + + output = ujson.encode(input, double_precision = -1) + # also should snap to the max, which is 15 + self.assertEquals(round(input, 15), json.loads(output)) + self.assertEquals(round(input, 15), ujson.decode(output)) + + # will throw typeError + self.assertRaises(TypeError, ujson.encode, input, double_precision = '9') + # will throw typeError + 
self.assertRaises(TypeError, ujson.encode, input, double_precision = None) + + + def test_encodeStringConversion(self): + input = "A string \\ / \b \f \n \r \t" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, '"A string \\\\ \\/ \\b \\f \\n \\r \\t"') + self.assertEquals(input, ujson.decode(output)) + pass + + def test_decodeUnicodeConversion(self): + pass + + def test_encodeUnicodeConversion1(self): + input = "Räksmörgås اسامة بن محمد بن عوض بن لادن" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeControlEscaping(self): + input = "\x19" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(input, dec) + self.assertEquals(enc, json_unicode(input)) + + + def test_encodeUnicodeConversion2(self): + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicodeSurrogatePair(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x90\x8d\x86" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x91\x80\xb0TRAILINGNORMAL" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8Highest(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL" + enc = ujson.encode(input) + + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + + def test_encodeArrayInArray(self): + input = [[[[]]]] + output = ujson.encode(input) + + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeIntConversion(self): + input = 31337 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeIntNegConversion(self): + input = -31337 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + + def test_encodeLongNegConversion(self): + input = -9223372036854775808 + output = ujson.encode(input) + + outputjson = json.loads(output) + outputujson = ujson.decode(output) + + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeListConversion(self): + input = [ 1, 2, 3, 4 ] + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeDictConversion(self): + input = { "k1": 1, "k2": 2, "k3": 3, "k4": 4 } + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + 
self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeNoneConversion(self): + input = None + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeTrueConversion(self): + input = True + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeFalseConversion(self): + input = False + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + # def test_encodeDatetimeConversion(self): + # ts = time.time() + # input = datetime.datetime.fromtimestamp(ts) + # output = ujson.encode(input) + # expected = calendar.timegm(input.utctimetuple()) + # self.assertEquals(int(expected), json.loads(output)) + # self.assertEquals(int(expected), ujson.decode(output)) + # pass + + # def test_encodeDateConversion(self): + # ts = time.time() + # input = datetime.date.fromtimestamp(ts) + + # output = ujson.encode(input) + # tup = ( input.year, input.month, input.day, 0, 0, 0 ) + + # expected = calendar.timegm(tup) + # self.assertEquals(int(expected), json.loads(output)) + # self.assertEquals(int(expected), ujson.decode(output)) + + def test_datetime_nanosecond_unit(self): + from datetime import datetime + from pandas.lib import Timestamp + + val = datetime.now() + stamp = Timestamp(val) + + roundtrip = ujson.decode(ujson.encode(val)) + self.assert_(roundtrip == stamp.value) + + def test_encodeToUTF8(self): + _skip_if_python_ver(2, 5) + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input, ensure_ascii=False) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input, ensure_ascii=False)) + self.assertEquals(dec, json.loads(enc)) + + def test_decodeFromUnicode(self): + input = u"{\"obj\": 31337}" + dec1 = ujson.decode(input) + dec2 = ujson.decode(str(input)) + self.assertEquals(dec1, dec2) + + def test_encodeRecursionMax(self): + # 8 is the max recursion depth + + class O2: + member = 0 + pass + + class O1: + member = 0 + pass + + input = O1() + input.member = O2() + input.member.member = input + + try: + output = ujson.encode(input) + assert False, "Expected overflow exception" + except(OverflowError): + pass + + def test_encodeDoubleNan(self): + input = np.nan + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleInf(self): + input = np.inf + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleNegInf(self): + input = -np.inf + assert ujson.encode(input) == 'null', "Expected null" + + + def test_decodeJibberish(self): + input = "fdsa sda v9sa fdsa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayStart(self): + input = "[" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectStart(self): + input = "{" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayEnd(self): + input = "]" + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectEnd(self): + input = "}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUnterminated(self): + input = "\"TESTING" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUntermEscapeSequence(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringBadEscape(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeTrueBroken(self): + input = "tru" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeFalseBroken(self): + input = "fa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeNullBroken(self): + input = "n" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + + def test_decodeBrokenDictKeyTypeLeakTest(self): + input = '{{1337:""}}' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError),e: + continue + + assert False, "Wrong exception" + + def test_decodeBrokenDictLeakTest(self): + input = '{{"key":"}' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeBrokenListLeakTest(self): + input = '[[[true' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeDictWithNoKey(self): + input = "{{{{31337}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoColonOrValue(self): + input = "{{{{\"key\"}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoValue(self): + input = "{{{{\"key\":}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeNumericIntPos(self): + input = "31337" + self.assertEquals (31337, ujson.decode(input)) + + def test_decodeNumericIntNeg(self): + input = "-31337" + self.assertEquals (-31337, ujson.decode(input)) + + def test_encodeUnicode4BytesUTF8Fail(self): + _skip_if_python_ver(3) + input = "\xfd\xbf\xbf\xbf\xbf\xbf" + try: + enc = ujson.encode(input) + assert False, "Expected exception" + except OverflowError: + pass + + def test_encodeNullCharacter(self): + input = "31337 \x00 1337" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + + input = "\x00" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + + self.assertEquals('" \\u0000\\r\\n "', ujson.dumps(u" \u0000\r\n ")) + pass + + def test_decodeNullCharacter(self): + input = "\"31337 \\u0000 31337\"" + self.assertEquals(ujson.decode(input), json.loads(input)) + + + def test_encodeListLongConversion(self): + input = [9223372036854775807, 9223372036854775807, 9223372036854775807, + 9223372036854775807, 9223372036854775807, 9223372036854775807 ] + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True, + dtype=np.int64)) + pass + + def test_encodeLongConversion(self): + input = 9223372036854775807 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_numericIntExp(self): + input = "1337E40" + output = ujson.decode(input) + self.assertEquals(output, json.loads(input)) + + def test_numericIntFrcExp(self): + input = "1.337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEPLUS(self): + input = "1337E+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpePLUS(self): + input = "1.337e+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpE(self): + input = "1337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpe(self): + input = "1337e40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEMinus(self): + input = "1.337E-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpeMinus(self): + input = "1.337e-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_dumpToFile(self): + f = StringIO.StringIO() + ujson.dump([1, 2, 3], f) + self.assertEquals("[1,2,3]", f.getvalue()) + + def test_dumpToFileLikeObject(self): + class filelike: + def __init__(self): + self.bytes = '' + def write(self, bytes): + self.bytes += bytes + f = filelike() + ujson.dump([1, 2, 3], f) + self.assertEquals("[1,2,3]", f.bytes) + + def test_dumpFileArgsError(self): + try: + ujson.dump([], '') + except TypeError: + pass + else: + assert False, 'expected TypeError' + + def test_loadFile(self): + f = StringIO.StringIO("[1,2,3,4]") + 
self.assertEquals([1, 2, 3, 4], ujson.load(f)) + f = StringIO.StringIO("[1,2,3,4]") + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileLikeObject(self): + class filelike: + def read(self): + try: + self.end + except AttributeError: + self.end = True + return "[1,2,3,4]" + f = filelike() + self.assertEquals([1, 2, 3, 4], ujson.load(f)) + f = filelike() + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileArgsError(self): + try: + ujson.load("[]") + except TypeError: + pass + else: + assert False, "expected TypeError" + + def test_version(self): + assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ + "ujson.__version__ must be a string like '1.4.0'" + + def test_encodeNumericOverflow(self): + try: + ujson.encode(12839128391289382193812939) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_encodeNumericOverflowNested(self): + for n in xrange(0, 100): + class Nested: + x = 12839128391289382193812939 + + nested = Nested() + + try: + ujson.encode(nested) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_decodeNumberWith32bitSignBit(self): + #Test that numbers that fit within 32 bits but would have the + # sign bit set (2**31 <= x < 2**32) are decoded properly. + boundary1 = 2**31 + boundary2 = 2**32 + docs = ( + '{"id": 3590016419}', + '{"id": %s}' % 2**31, + '{"id": %s}' % 2**32, + '{"id": %s}' % ((2**32)-1), + ) + results = (3590016419, 2**31, 2**32, 2**32-1) + for doc,result in zip(docs, results): + self.assertEqual(ujson.decode(doc)['id'], result) + + def test_encodeBigEscape(self): + for x in xrange(10): + if py3compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + input = base * 1024 * 1024 * 2 + output = ujson.encode(input) + + def test_decodeBigEscape(self): + for x in xrange(10): + if py3compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + quote = py3compat.str_to_bytes("\"") + input = quote + (base * 1024 * 1024 * 2) + quote + output = ujson.decode(input) + + def test_toDict(self): + d = {u"key": 31337} + + class DictTest: + def toDict(self): + return d + + o = DictTest() + output = ujson.encode(o) + dec = ujson.decode(output) + self.assertEquals(dec, d) + + +class NumpyJSONTests(TestCase): + + def testBool(self): + b = np.bool(True) + self.assertEqual(ujson.decode(ujson.encode(b)), b) + + def testBoolArray(self): + inpt = np.array([True, False, True, True, False, True, False , False], + dtype=np.bool) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) + assert_array_equal(inpt, outp) + + def testInt(self): + num = np.int(2562010) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(127) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(2562010) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(2562010) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.int64(2562010) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(255) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(2562010) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(2562010) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + num = np.uint64(2562010) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def 
testIntArray(self): + arr = np.arange(100, dtype=np.int) + dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, + np.uint, np.uint8, np.uint16, np.uint32, np.uint64) + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) + assert_array_equal(inpt, outp) + + def testIntMax(self): + num = np.int(np.iinfo(np.int).max) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(np.iinfo(np.int8).max) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(np.iinfo(np.int16).max) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(np.iinfo(np.int32).max) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(np.iinfo(np.uint8).max) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(np.iinfo(np.uint16).max) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(np.iinfo(np.uint32).max) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + if platform.architecture()[0] != '32bit': + num = np.int64(np.iinfo(np.int64).max) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + # uint64 max will always overflow as it's encoded to signed + num = np.uint64(np.iinfo(np.int64).max) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def testFloat(self): + num = np.float(256.2013) + self.assertEqual(np.float(ujson.decode(ujson.encode(num))), num) + + num = np.float32(256.2013) + self.assertEqual(np.float32(ujson.decode(ujson.encode(num))), num) + + num = np.float64(256.2013) + self.assertEqual(np.float64(ujson.decode(ujson.encode(num))), num) + + def testFloatArray(self): + arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) + dtypes = (np.float, np.float32, np.float64) + + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt, double_precision=15)), dtype=dtype) + assert_array_almost_equal_nulp(inpt, outp) + + def testFloatMax(self): + num = np.float(np.finfo(np.float).max/10) + assert_approx_equal(np.float(ujson.decode(ujson.encode(num))), num, 15) + + num = np.float32(np.finfo(np.float32).max/10) + assert_approx_equal(np.float32(ujson.decode(ujson.encode(num))), num, 15) + + num = np.float64(np.finfo(np.float64).max/10) + assert_approx_equal(np.float64(ujson.decode(ujson.encode(num))), num, 15) + + def testArrays(self): + arr = np.arange(100); + + arr = arr.reshape((10, 10)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((5, 5, 4)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((100, 1)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = np.arange(96); + arr = arr.reshape((2, 2, 2, 2, 3, 2)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + l = ['a', list(), dict(), dict(), list(), + 42, 97.8, ['a', 'b'], {'key': 'val'}] + arr = np.array(l) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + + arr = np.arange(100.202, 200.202, 1, dtype=np.float32); + arr = arr.reshape((5, 5, 4)) + outp = 
np.array(ujson.decode(ujson.encode(arr)), dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + + def testArrayNumpyExcept(self): + + input = ujson.dumps([42, {}, 'a']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps(['a', 'b', [], 'c']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, ['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{}, []]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, None]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 'b'}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps({'a': {'b': {'c': 42}}}) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 42, 'b': 23}, {'c': 17}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" 
+ except(ValueError): + pass + except: + assert False, "Wrong exception" + + def testArrayNumpyLabelled(self): + input = {'a': []} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + self.assertTrue((np.empty((1, 0)) == output[0]).all()) + self.assertTrue((np.array(['a']) == output[1]).all()) + self.assertTrue(output[2] is None) + + input = [{'a': 42}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + self.assertTrue((np.array([42]) == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a']) == output[2]).all()) + + input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a', 'b']) == output[2]).all()) + + + input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue((np.array(['1','2','3']) == output[1]).all()) + self.assertTrue((np.array(['a', 'b']) == output[2]).all()) + +class PandasJSONTests(TestCase): + + def testDataFrame(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(ujson.decode(ujson.encode(df))) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"))) + outp = DataFrame(**dec) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="records"))) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="values"))) + outp.index = df.index + self.assertTrue((df.values == outp.values).all()) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"))) + self.assertTrue((df.transpose() == outp).values.all()) + assert_array_equal(df.transpose().columns, outp.columns) + assert_array_equal(df.transpose().index, outp.index) + + + def testDataFrameNumpy(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"), + numpy=True)) + outp = DataFrame(**dec) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), numpy=True)) + self.assertTrue((df.transpose() == outp).values.all()) + assert_array_equal(df.transpose().columns, outp.columns) + assert_array_equal(df.transpose().index, outp.index) + + def testDataFrameNested(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + nested = {'df1': df, 'df2': df.copy()} + + exp = {'df1': 
ujson.decode(ujson.encode(df)), + 'df2': ujson.decode(ujson.encode(df))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="index")), + 'df2': ujson.decode(ujson.encode(df, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="records")), + 'df2': ujson.decode(ujson.encode(df, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="values")), + 'df2': ujson.decode(ujson.encode(df, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="split")), + 'df2': ujson.decode(ujson.encode(df, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + def testDataFrameNumpyLabelled(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) + self.assertTrue((df.T == outp).values.all()) + assert_array_equal(df.T.columns, outp.columns) + assert_array_equal(df.T.index, outp.index) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + def testSeries(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + # column indexed + outp = Series(ujson.decode(ujson.encode(s))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"))) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), + numpy=True)) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + def testSeriesNested(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + nested = {'s1': s, 's2': s.copy()} + + exp = {'s1': ujson.decode(ujson.encode(s)), + 's2': ujson.decode(ujson.encode(s))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, 
orient="split")), + 's2': ujson.decode(ujson.encode(s, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="records")), + 's2': ujson.decode(ujson.encode(s, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="values")), + 's2': ujson.decode(ujson.encode(s, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="index")), + 's2': ujson.decode(ujson.encode(s, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + def testIndex(self): + i = Index([23, 45, 18, 98, 43, 11], name="index") + + # column indexed + outp = Index(ujson.decode(ujson.encode(i))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i), numpy=True)) + self.assert_(i.equals(outp)) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) + outp = Index(**dec) + self.assert_(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), + numpy=True)) + outp = Index(**dec) + self.assert_(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"), numpy=True)) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"), numpy=True)) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"), numpy=True)) + self.assert_(i.equals(outp)) + + def test_datetimeindex(self): + from pandas.tseries.index import date_range, DatetimeIndex + + rng = date_range('1/1/2000', periods=20) + + encoded = ujson.encode(rng) + decoded = DatetimeIndex(np.array(ujson.decode(encoded))) + + self.assert_(rng.equals(decoded)) + + ts = Series(np.random.randn(len(rng)), index=rng) + decoded = Series(ujson.decode(ujson.encode(ts))) + idx_values = decoded.index.values.astype(np.int64) + decoded.index = DatetimeIndex(idx_values) + tm.assert_series_equal(np.round(ts, 5), decoded) + +""" +def test_decodeNumericIntFrcOverflow(self): +input = "X.Y" +raise NotImplementedError("Implement this test!") + + +def test_decodeStringUnicodeEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUnicodeBrokenEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUnicodeInvalidEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUTF8(self): +input = "someutfcharacters" +raise NotImplementedError("Implement this test!") + + + +""" + +def _clean_dict(d): + return dict((str(k), v) for k, v in d.iteritems()) + +if __name__ == '__main__': + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h new file mode 100644 index 0000000000000..eae665f00f03e --- /dev/null +++ 
b/pandas/src/ujson/lib/ultrajson.h @@ -0,0 +1,298 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +/* +Ultra fast JSON encoder and decoder +Developed by Jonas Tarnstrom (jonas@esn.me). + +Encoder notes: +------------------ + +:: Cyclic references :: +Cyclic referenced objects are not detected. +Set JSONObjectEncoder.recursionMax to suitable value or make sure input object +tree doesn't have cyclic references. 
+
+*/
+
+#ifndef __ULTRAJSON_H__
+#define __ULTRAJSON_H__
+
+#include <stdio.h>
+#include <wchar.h>
+
+//#define JSON_DECODE_NUMERIC_AS_DOUBLE
+
+// Don't output any extra whitespaces when encoding
+#define JSON_NO_EXTRA_WHITESPACE
+
+// Max decimals to encode double floating point numbers with
+#ifndef JSON_DOUBLE_MAX_DECIMALS
+#define JSON_DOUBLE_MAX_DECIMALS 15
+#endif
+
+// Max recursion depth, default for encoder
+#ifndef JSON_MAX_RECURSION_DEPTH
+#define JSON_MAX_RECURSION_DEPTH 1024
+#endif
+
+/*
+Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */
+#ifndef JSON_MAX_STACK_BUFFER_SIZE
+#define JSON_MAX_STACK_BUFFER_SIZE 131072
+#endif
+
+#ifdef _WIN32
+
+typedef __int64 JSINT64;
+typedef unsigned __int64 JSUINT64;
+
+typedef __int32 JSINT32;
+typedef unsigned __int32 JSUINT32;
+typedef unsigned __int8 JSUINT8;
+typedef unsigned __int16 JSUTF16;
+typedef unsigned __int32 JSUTF32;
+typedef __int64 JSLONG;
+
+#define EXPORTFUNCTION __declspec(dllexport)
+
+#define FASTCALL_MSVC __fastcall
+#define FASTCALL_ATTR
+#define INLINE_PREFIX __inline
+
+#else
+
+#include <sys/types.h>
+typedef int64_t JSINT64;
+typedef u_int64_t JSUINT64;
+
+typedef int32_t JSINT32;
+typedef u_int32_t JSUINT32;
+
+#define FASTCALL_MSVC
+#define FASTCALL_ATTR __attribute__((fastcall))
+#define INLINE_PREFIX inline
+
+typedef u_int8_t JSUINT8;
+typedef u_int16_t JSUTF16;
+typedef u_int32_t JSUTF32;
+
+typedef int64_t JSLONG;
+
+#define EXPORTFUNCTION
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define __LITTLE_ENDIAN__
+#else
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define __BIG_ENDIAN__
+#endif
+
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+#error "Endianness not supported"
+#endif
+
+enum JSTYPES
+{
+    JT_NULL,    // NULL
+    JT_TRUE,    // boolean true
+    JT_FALSE,   // boolean false
+    JT_INT,     // (JSINT32 (signed 32-bit))
+    JT_LONG,    // (JSINT64 (signed 64-bit))
+    JT_DOUBLE,  // (double)
+    JT_UTF8,    // (char 8-bit)
+    JT_ARRAY,   // Array structure
+    JT_OBJECT,  // Key/Value structure
+    JT_INVALID, // Internal, do not return nor expect
+};
+
+typedef void * JSOBJ;
+typedef void * JSITER;
+
+typedef struct __JSONTypeContext
+{
+    int type;
+    void *encoder;
+    void *prv;
+} JSONTypeContext;
+
+/*
+Function pointer declarations, suitable for implementing UltraJSON */
+typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc);
+typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc);
+typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc);
+typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc);
+typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, size_t *outLen);
+typedef void *(*JSPFN_MALLOC)(size_t size);
+typedef void (*JSPFN_FREE)(void *pptr);
+typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
+
+typedef struct __JSONObjectEncoder
+{
+    void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
+    void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc);
+    const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen);
+    JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
+    JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
+    double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
+
+    /*
+    Begin iteration of an iterable object (JS_ARRAY or JS_OBJECT)
+    Implementor should setup iteration state in ti->prv
+    */
+    JSPFN_ITERBEGIN iterBegin;
+
+    /*
+    Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items.
+    Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this
+    */
+    JSPFN_ITERNEXT iterNext;
+
+    /*
+    Ends the iteration of an iterable object.
+    Any iteration state stored in ti->prv can be freed here
+    */
+    JSPFN_ITEREND iterEnd;
+
+    /*
+    Returns a reference to the value object of an iterator
+    The implementor is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
+    */
+    JSPFN_ITERGETVALUE iterGetValue;
+
+    /*
+    Return name of iterator.
+    The implementor is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
+    */
+    JSPFN_ITERGETNAME iterGetName;
+
+    /*
+    Release a value as indicated by setting ti->release = 1 in the previous getValue call.
+    The ti->prv array should contain the necessary context to release the value
+    */
+    void (*releaseObject)(JSOBJ obj);
+
+    /* Library functions
+    Set to NULL to use STDLIB malloc,realloc,free */
+    JSPFN_MALLOC malloc;
+    JSPFN_REALLOC realloc;
+    JSPFN_FREE free;
+
+    /*
+    Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH) */
+    int recursionMax;
+
+    /*
+    Configuration for max decimals of double floating point numbers to encode (0-9) */
+    int doublePrecision;
+
+    /*
+    If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or whatever charset strings are brought as */
+    int forceASCII;
+
+
+    /*
+    Set to an error message if an error occurred */
+    const char *errorMsg;
+    JSOBJ errorObj;
+
+    /* Buffer stuff */
+    char *start;
+    char *offset;
+    char *end;
+    int heap;
+    int level;
+
+} JSONObjectEncoder;
+
+
+/*
+Encode an object structure into JSON.
+
+Arguments:
+obj - An anonymous type representing the object
+enc - Function definitions for querying JSOBJ type
+buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
+cbBuffer - Length of buffer (ignored if buffer is NULL)
+
+Returns:
+Encoded JSON object as a null terminated char string.
+
+NOTE:
+If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
+Life cycle of the provided buffer must still be handled by caller.
+
+If the return value doesn't equal the specified buffer caller must release the memory using
+JSONObjectEncoder.free or free() as specified when calling this function.
+*/
+EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer);
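At the Python level the doublePrecision and forceASCII fields surface as the double_precision and ensure_ascii keyword arguments of pandas.json.encode, as the test suite earlier in this patch exercises. A small sketch of that mapping, assuming the built extension:

    import pandas.json as ujson

    # doublePrecision: decimal places kept when encoding doubles (default 10)
    assert ujson.decode(ujson.encode(3.14159265358979, double_precision=3)) == 3.142

    # forceASCII: characters above 127 are escaped as \uXXXX unless disabled
    ujson.encode(u'R\xe4ksm\xf6rg\xe5s', ensure_ascii=False)  # raw UTF-8 output
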
+
+
+
+typedef struct __JSONObjectDecoder
+{
+    JSOBJ (*newString)(wchar_t *start, wchar_t *end);
+    int (*objectAddKey)(JSOBJ obj, JSOBJ name, JSOBJ value);
+    int (*arrayAddItem)(JSOBJ obj, JSOBJ value);
+    JSOBJ (*newTrue)(void);
+    JSOBJ (*newFalse)(void);
+    JSOBJ (*newNull)(void);
+    JSOBJ (*newObject)(void *decoder);
+    JSOBJ (*endObject)(JSOBJ obj);
+    JSOBJ (*newArray)(void *decoder);
+    JSOBJ (*endArray)(JSOBJ obj);
+    JSOBJ (*newInt)(JSINT32 value);
+    JSOBJ (*newLong)(JSINT64 value);
+    JSOBJ (*newDouble)(double value);
+    void (*releaseObject)(JSOBJ obj, void *decoder);
+    JSPFN_MALLOC malloc;
+    JSPFN_FREE free;
+    JSPFN_REALLOC realloc;
+
+    char *errorStr;
+    char *errorOffset;
+
+
+
+} JSONObjectDecoder;
+
+EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer);
+
+#endif
diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h
new file mode 100644
index 0000000000000..1544c2e3cf34d
--- /dev/null
+++ b/pandas/src/ujson/python/py_defines.h
@@ -0,0 +1,15 @@
+#include <Python.h>
+
+#if PY_MAJOR_VERSION >= 3
+
+#define PyInt_Check PyLong_Check
+#define PyInt_AS_LONG PyLong_AsLong
+#define PyInt_FromLong PyLong_FromLong
+
+#define PyString_Check PyBytes_Check
+#define PyString_GET_SIZE PyBytes_GET_SIZE
+#define PyString_AS_STRING PyBytes_AS_STRING
+
+#define PyString_FromString PyUnicode_FromString
+
+#endif
diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h
new file mode 100644
index 0000000000000..9449441411192
--- /dev/null
+++ b/pandas/src/ujson/python/version.h
@@ -0,0 +1 @@
+#define UJSON_VERSION "1.18"
diff --git a/setup.py b/setup.py
index 030584ba509d3..1cc666c87404b 100755
--- a/setup.py
+++ b/setup.py
@@ -250,6 +250,11 @@ def initialize_options(self):
             for f in files:
                 if f in self._clean_exclude:
                     continue
+
+                # XXX
+                if 'ujson' in f:
+                    continue
+
                 if os.path.splitext(f)[-1] in ('.pyc', '.so', '.o',
                                                '.pyo',
                                                '.pyd', '.c', '.orig'):
@@ -457,6 +462,21 @@ def pxd(name):
     root, _ = os.path.splitext(ext.sources[0])
     ext.sources[0] = root + suffix
 
+ujson_ext = Extension('pandas.json',
+                      depends=['pandas/src/ujson/lib/ultrajson.h'],
+                      sources=['pandas/src/ujson/python/ujson.c',
+                               'pandas/src/ujson/python/objToJSON.c',
+                               'pandas/src/ujson/python/JSONtoObj.c',
+                               'pandas/src/ujson/lib/ultrajsonenc.c',
+                               'pandas/src/ujson/lib/ultrajsondec.c',
+                               'pandas/src/datetime/np_datetime.c',
+                               'pandas/src/datetime/np_datetime_strings.c'],
+                      include_dirs=['pandas/src/ujson/python',
+                                    'pandas/src/ujson/lib'] + common_include)
+
+
+extensions.append(ujson_ext)
+
 if _have_setuptools:
     setuptools_kwargs["test_suite"] = "nose.collector"
 
@@ -485,6 +505,7 @@ def pxd(name):
         'pandas.tseries',
         'pandas.tseries.tests',
         'pandas.io.tests',
+        'pandas.io.tests.test_json',
         'pandas.stats.tests',
     ],
     package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
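Once the extension is rebuilt (e.g. python setup.py build_ext --inplace), a minimal smoke test — hypothetical, not part of the patch — is that pandas.json imports and round-trips plain objects:

    import pandas.json

    data = {'a': [1, 2.5, None]}   # None encodes to null and back
    assert pandas.json.loads(pandas.json.dumps(data)) == data
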
Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the ESN Social Software AB nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+http://code.google.com/p/stringencoders/
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
\ No newline at end of file
From ade5d0ffc7e752522051332f8d23aa5ba0cae55b Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sun, 12 May 2013 11:13:13 -0700
Subject: [PATCH 03/10] TST: json manip test script. and trigger travis

---
 scripts/json_manip.py | 421 ++++++++++++++++++++++++++++++++++
 1 file changed, 421 insertions(+)
 create mode 100644 scripts/json_manip.py

diff --git a/scripts/json_manip.py b/scripts/json_manip.py
new file mode 100644
index 0000000000000..e76a99cca344a
--- /dev/null
+++ b/scripts/json_manip.py
@@ -0,0 +1,421 @@
+"""
+
+Tasks
+-------
+
+Search and transform jsonable structures, specifically to make it 'easy' to make tabular/csv output for other consumers.
+
+Example
+~~~~~~~~~~~~~
+
+    *give me a list of all the fields called 'id' in this stupid, gnarly
+    thing*
+
+    >>> Q('id',gnarly_data)
+    ['id1','id2','id3']
+
+
+Observations:
+---------------------
+
+1)  'simple data structures' exist and are common.  They are tedious
+    to search.
+
+2)  The DOM is another nested / treeish structure, and jQuery selector is
+    a good tool for that.
+
+3a) R, Numpy, Excel and other analysis tools want 'tabular' data.  These
+    analyses are valuable and worth doing.
+
+3b) Dot/Graphviz, NetworkX, and some other analyses *like* treeish/dicty
+    things, and those analyses are also worth doing!
+
+3c) Some analyses are best done using 'one-off' and custom code in C, Python,
+    or another 'real' programming language.
+
+4)  Arbitrary transforms are tedious and error-prone.  SQL is one solution,
+    XSLT is another.
+
+5)  the XPATH/XML/XSLT family is....
not universally loved :) They are + very complete, and the completeness can make simple cases... gross. + +6) For really complicated data structures, we can write one-off code. Getting + 80% of the way is mostly okay. There will always have to be programmers + in the loop. + +7) Re-inventing SQL is probably a failure mode. So is reinventing XPATH, XSLT + and the like. Be wary of mission creep! Re-use when possible (e.g., can + we put the thing into a DOM using + +8) If the interface is good, people can improve performance later. + + +Simplifying +--------------- + + +1) Assuming 'jsonable' structures + +2) keys are strings or stringlike. Python allows any hashable to be a key. + for now, we pretend that doesn't happen. + +3) assumes most dicts are 'well behaved'. DAG, no cycles! + +4) assume that if people want really specialized transforms, they can do it + themselves. + +""" + +from collections import Counter, namedtuple +import csv +import itertools +from itertools import product +from operator import attrgetter as aget, itemgetter as iget +import operator +import sys + + + +## note 'url' appears multiple places and not all extensions have same struct +ex1 = { + 'name': 'Gregg', + 'extensions': [ + {'id':'hello', + 'url':'url1'}, + {'id':'gbye', + 'url':'url2', + 'more': dict(url='url3')}, + ] +} + +## much longer example +ex2 = {u'metadata': {u'accessibilities': [{u'name': u'accessibility.tabfocus', + u'value': 7}, + {u'name': u'accessibility.mouse_focuses_formcontrol', u'value': False}, + {u'name': u'accessibility.browsewithcaret', u'value': False}, + {u'name': u'accessibility.win32.force_disabled', u'value': False}, + {u'name': u'accessibility.typeaheadfind.startlinksonly', u'value': False}, + {u'name': u'accessibility.usebrailledisplay', u'value': u''}, + {u'name': u'accessibility.typeaheadfind.timeout', u'value': 5000}, + {u'name': u'accessibility.typeaheadfind.enabletimeout', u'value': True}, + {u'name': u'accessibility.tabfocus_applies_to_xul', u'value': False}, + {u'name': u'accessibility.typeaheadfind.flashBar', u'value': 1}, + {u'name': u'accessibility.typeaheadfind.autostart', u'value': True}, + {u'name': u'accessibility.blockautorefresh', u'value': False}, + {u'name': u'accessibility.browsewithcaret_shortcut.enabled', + u'value': True}, + {u'name': u'accessibility.typeaheadfind.enablesound', u'value': True}, + {u'name': u'accessibility.typeaheadfind.prefillwithselection', + u'value': True}, + {u'name': u'accessibility.typeaheadfind.soundURL', u'value': u'beep'}, + {u'name': u'accessibility.typeaheadfind', u'value': False}, + {u'name': u'accessibility.typeaheadfind.casesensitive', u'value': 0}, + {u'name': u'accessibility.warn_on_browsewithcaret', u'value': True}, + {u'name': u'accessibility.usetexttospeech', u'value': u''}, + {u'name': u'accessibility.accesskeycausesactivation', u'value': True}, + {u'name': u'accessibility.typeaheadfind.linksonly', u'value': False}, + {u'name': u'isInstantiated', u'value': True}], + u'extensions': [{u'id': u'216ee7f7f4a5b8175374cd62150664efe2433a31', + u'isEnabled': True}, + {u'id': u'1aa53d3b720800c43c4ced5740a6e82bb0b3813e', u'isEnabled': False}, + {u'id': u'01ecfac5a7bd8c9e27b7c5499e71c2d285084b37', u'isEnabled': True}, + {u'id': u'1c01f5b22371b70b312ace94785f7b0b87c3dfb2', u'isEnabled': True}, + {u'id': u'fb723781a2385055f7d024788b75e959ad8ea8c3', u'isEnabled': True}], + u'fxVersion': u'9.0', + u'location': u'zh-CN', + u'operatingSystem': u'WINNT Windows NT 5.1', + u'surveyAnswers': u'', + u'task_guid': 
u'd69fbd15-2517-45b5-8a17-bb7354122a75', + u'tpVersion': u'1.2', + u'updateChannel': u'beta'}, + u'survey_data': { + u'extensions': [{u'appDisabled': False, + u'id': u'testpilot?labs.mozilla.com', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'Test Pilot'}, + {u'appDisabled': True, + u'id': u'dict?www.youdao.com', + u'isCompatible': False, + u'isEnabled': False, + u'isPlatformCompatible': True, + u'name': u'Youdao Word Capturer'}, + {u'appDisabled': False, + u'id': u'jqs?sun.com', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'Java Quick Starter'}, + {u'appDisabled': False, + u'id': u'?20a82645-c095-46ed-80e3-08825760534b?', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'Microsoft .NET Framework Assistant'}, + {u'appDisabled': False, + u'id': u'?a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7?', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'WOT'}], + u'version_number': 1}} + +# class SurveyResult(object): + +# def __init__(self, record): +# self.record = record +# self.metadata, self.survey_data = self._flatten_results() + +# def _flatten_results(self): +# survey_data = self.record['survey_data'] +# extensions = DataFrame(survey_data['extensions']) + +def denorm(queries,iterable_of_things,default=None): + """ + 'repeat', or 'stutter' to 'tableize' for downstream. + (I have no idea what a good word for this is!) + + Think ``kronecker`` products, or: + + ``SELECT single,multiple FROM table;`` + + single multiple + ------- --------- + id1 val1 + id1 val2 + + + Args: + + queries: iterable of ``Q`` queries. + iterable_of_things: to be queried. + + Returns: + + list of 'stuttered' output, where if a query returns + a 'single', it gets repeated appropriately. + + + """ + + def _denorm(queries,thing): + fields = [] + results = [] + for q in queries: + #print q + r = Ql(q,thing) + #print "-- result: ", r + if not r: + r = [default] + if type(r[0]) is type({}): + fields.append(sorted(r[0].keys())) # dicty answers + else: + fields.append([q]) # stringy answer + + results.append(r) + + #print results + #print fields + flist = list(flatten(*map(iter,fields))) + + prod = itertools.product(*results) + for p in prod: + U = dict() + for (ii,thing) in enumerate(p): + #print ii,thing + if type(thing) is type({}): + U.update(thing) + else: + U[fields[ii][0]] = thing + + yield U + + return list(flatten(*[_denorm(queries,thing) for thing in iterable_of_things])) + + +def default_iget(fields,default=None,): + """ itemgetter with 'default' handling, that *always* returns lists + + API CHANGES from ``operator.itemgetter`` + + Note: Sorry to break the iget api... (fields vs *fields) + Note: *always* returns a list... unlike itemgetter, + which can return tuples or 'singles' + """ + myiget = operator.itemgetter(*fields) + L = len(fields) + def f(thing): + try: + ans = list(myiget(thing)) + if L < 2: + ans = [ans,] + return ans + except KeyError: + # slower! + return [thing.get(x,default) for x in fields] + + f.__doc__ = "itemgetter with default %r for fields %r" %(default,fields) + f.__name__ = "default_itemgetter" + return f + + +def flatten(*stack): + """ + helper function for flattening iterables of generators in a + sensible way. 
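+
+    Illustrative doctest (hypothetical values; uses the Python 2 ``.next()``
+    iterator protocol that the body below relies on):
+
+    >>> list(flatten(iter([1, 2]), iter([3])))
+    [1, 2, 3]
+    >>> list(flatten(iter([iter([1, 2]), 3])))
+    [1, 2, 3]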
+    """
+    stack = list(stack)
+    while stack:
+        try:
+            x = stack[0].next()
+        except StopIteration:
+            stack.pop(0)
+            continue
+        if hasattr(x, 'next') and callable(getattr(x, 'next')):
+            # x is itself an iterator/generator: push it so it drains first
+            stack.insert(0, x)
+        else:
+            yield x
+
+
+def _Q(filter_, thing):
+    """ underlying machinery for Q function recursion """
+    T = type(thing)
+    if T is type({}):
+        for k, v in thing.iteritems():
+            if filter_ == k:
+                if type(v) is type([]):
+                    yield iter(v)
+                else:
+                    yield v
+
+            if type(v) in (type({}), type([])):
+                yield Q(filter_, v)
+
+    elif T is type([]):
+        for k in thing:
+            yield Q(filter_, k)
+
+    else:
+        # not a dict or a list: nothing to recurse into
+        pass
+
+def Q(filter_, thing):
+    """
+    type(filter_):
+      - list: a flattened list of all searches (one list)
+      - dict: dict with vals each of which is that search
+
+    Notes:
+
+    [1] 'parent thing', with a space, will do a descendant search
+    [2] this will come back 'flattened' jQuery style
+    [3] returns a generator.  Use ``Ql`` if you want a list.
+
+    """
+    if type(filter_) is type([]):
+        return flatten(*[_Q(x, thing) for x in filter_])
+    elif type(filter_) is type({}):
+        d = dict.fromkeys(filter_.keys())
+        for k in d:
+            d[k] = Q(k, thing)
+
+        return d
+
+    else:
+        if " " in filter_:  # i.e. "antecedent descendant"
+            parts = filter_.strip().split()
+            r = None
+            for p in parts:
+                r = Ql(p, thing)
+                thing = r
+
+            return r
+
+        else:  # simple.
+            return flatten(_Q(filter_, thing))
+
+def Ql(filter_, thing):
+    """ same as Q, but returns a list, not a generator """
+    res = Q(filter_, thing)
+
+    if type(filter_) is type({}):
+        for k in res:
+            res[k] = list(res[k])
+        return res
+
+    else:
+        return list(res)
+
+
+
+def countit(fields, iter_of_iter, default=None):
+    """
+    count distinct value tuples of ``fields`` across ``iter_of_iter``;
+    robust to fields missing from an item, which get ``default``
+    """
+    # Counter needs hashable keys, hence the namedtuple
+    T = namedtuple("Thing", fields)
+    get = default_iget(fields, default=default)
+    return Counter(
+        (T(*get(thing)) for thing in iter_of_iter)
+    )
+
+
+## right now this works for one row...
+def printout(queries, things, default=None, f=sys.stdout, **kwargs):
+    """ will print a csv header and the flattened objects
+
+    **kwargs go to csv.DictWriter
+
+    help(csv.DictWriter) for more.
+    """
+
+    results = denorm(queries, things, default=default)
+    # sort for a stable header order; a plain set is unordered
+    fields = sorted(set(itertools.chain(*(x.keys() for x in results))))
+
+    W = csv.DictWriter(f=f, fieldnames=fields, **kwargs)
+    W.writeheader()
+    for r in results:
+        W.writerow(r)
+
+
+def test_run():
+    print "\n>>> print list(Q('url',ex1))"
+    print list(Q('url',ex1))
+    assert list(Q('url',ex1)) == ['url1','url2','url3']
+    assert Ql('url',ex1) == ['url1','url2','url3']
+
+    print "\n>>> print list(Q(['name','id'],ex1))"
+    print list(Q(['name','id'],ex1))
+    assert Ql(['name','id'],ex1) == ['Gregg','hello','gbye']
+
+    print "\n>>> print Ql('more url',ex1)"
+    print Ql('more url',ex1)
+
+    print "\n>>> list(Q('extensions',ex1))"
+    print list(Q('extensions',ex1))
+
+    print "\n>>> print Ql('extensions',ex1)"
+    print Ql('extensions',ex1)
+
+    print "\n>>> printout(['name','extensions'],[ex1,], extrasaction='ignore')"
+    printout(['name','extensions'],[ex1,], extrasaction='ignore')
+
+    print "\n\n"
+
+    from pprint import pprint as pp
+
+    print "-- note that the extension fields are also flattened!
(and N/A) -- " + pp(denorm(['location','fxVersion','notthere','survey_data extensions'],[ex2,], default="N/A")[:2]) + + +if __name__ == "__main__": + pass From 9633880214ca6f6372ced250146368628014e3d0 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 7 Jun 2013 17:37:49 -0400 Subject: [PATCH 04/10] BLD: fix setup.py to work on current pandas --- pandas/io/tests/test_json/__init__.py | 0 pandas/src/ujson/lib/ultrajsondec.c | 845 ++++++++++++ pandas/src/ujson/lib/ultrajsonenc.c | 891 +++++++++++++ pandas/src/ujson/python/JSONtoObj.c | 674 ++++++++++ pandas/src/ujson/python/objToJSON.c | 1701 +++++++++++++++++++++++++ pandas/src/ujson/python/ujson.c | 73 ++ setup.py | 11 +- 7 files changed, 4193 insertions(+), 2 deletions(-) create mode 100644 pandas/io/tests/test_json/__init__.py create mode 100644 pandas/src/ujson/lib/ultrajsondec.c create mode 100644 pandas/src/ujson/lib/ultrajsonenc.c create mode 100644 pandas/src/ujson/python/JSONtoObj.c create mode 100644 pandas/src/ujson/python/objToJSON.c create mode 100644 pandas/src/ujson/python/ujson.c diff --git a/pandas/io/tests/test_json/__init__.py b/pandas/io/tests/test_json/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c new file mode 100644 index 0000000000000..eda30f3fea839 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -0,0 +1,845 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
+ +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include + +struct DecoderState +{ + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSONObjectDecoder *dec; +}; + +JSOBJ FASTCALL_MSVC decode_any( struct DecoderState *ds) FASTCALL_ATTR; +typedef JSOBJ (*PFN_DECODER)( struct DecoderState *ds); +#define RETURN_JSOBJ_NULLCHECK(_expr) return(_expr); + +double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) +{ + static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; + + return (intValue + (frcValue / g_pow10[frcDecimalCount])) * intNeg; +} + +static JSOBJ SetError( struct DecoderState *ds, int offset, const char *message) +{ + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *) message; + return NULL; +} + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric ( struct DecoderState *ds) +{ +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + double intNeg = 1; + double intValue; +#else + int intNeg = 1; + JSLONG intValue; +#endif + + double expNeg; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expValue; + char *offset = ds->start; + + if (*(offset) == '-') + { + offset ++; + intNeg = -1; + } + + // Scan integer part + intValue = 0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + //FIXME: Check for arithemtic overflow here + //PERF: Don't do 64-bit arithmetic here unless we know we have to +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + intValue = intValue * 10.0 + (double) (chr - 48); +#else + intValue = intValue * 10LL + (JSLONG) (chr - 48); +#endif + offset ++; + break; + + case '.': + offset ++; + goto DECODE_FRACTION; + break; + + case 'e': + case 'E': + offset ++; + goto DECODE_EXPONENT; + break; + + default: + goto BREAK_INT_LOOP; + break; + } + } + +BREAK_INT_LOOP: + + ds->lastType = JT_INT; + ds->start = offset; + + //If input string is LONGLONG_MIN here the value is already negative so we should not flip it + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE +#else + if (intValue < 0) + { + intNeg = 1; + } +#endif + + //dbg1 = (intValue * intNeg); + //dbg2 = (JSLONG) dbg1; + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + if (intValue > (double) INT_MAX || intValue < (double) INT_MIN) +#else + if ( (intValue >> 31)) +#endif + { + RETURN_JSOBJ_NULLCHECK(ds->dec->newLong( (JSINT64) (intValue * (JSINT64) intNeg))); + } + else + { + RETURN_JSOBJ_NULLCHECK(ds->dec->newInt( (JSINT32) (intValue * intNeg))); + } + + + +DECODE_FRACTION: + + // Scan fraction part + frcValue = 0.0; + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) + { + frcValue = frcValue * 10.0 + (double) (chr - 48); + decimalCount ++; + } + offset ++; + break; + + case 'e': + case 'E': + offset ++; + goto DECODE_EXPONENT; + break; + + default: + goto BREAK_FRC_LOOP; + } + } + +BREAK_FRC_LOOP: + + if (intValue < 0) + { + intNeg = 1; + } + + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newDouble (createDouble( (double) intNeg, (double) intValue, 
frcValue, decimalCount))); + +DECODE_EXPONENT: + expNeg = 1.0; + + if (*(offset) == '-') + { + expNeg = -1.0; + offset ++; + } + else + if (*(offset) == '+') + { + expNeg = +1.0; + offset ++; + } + + expValue = 0.0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + expValue = expValue * 10.0 + (double) (chr - 48); + offset ++; + break; + + default: + goto BREAK_EXP_LOOP; + + } + } + +BREAK_EXP_LOOP: + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE +#else + if (intValue < 0) + { + intNeg = 1; + } +#endif + + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newDouble (createDouble( (double) intNeg, (double) intValue , frcValue, decimalCount) * pow(10.0, expValue * expNeg))); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'r') + goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_TRUE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newTrue()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'a') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 's') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_FALSE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newFalse()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); + +} + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newNull()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'null'"); +} + +FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) +{ + char *offset = ds->start; + + while (1) + { + switch (*offset) + { + case ' ': + case '\t': + case '\r': + case '\n': + offset ++; + break; + + default: + ds->start = offset; + return; + } + } +} + + +enum DECODESTRINGSTATE +{ + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, + +}; + +static const JSUINT8 g_decoderLookup[256] = +{ +/* 0x00 */ DS_ISNULL, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x20 */ 1, 1, DS_ISQUOTE, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, DS_ISESCAPE, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, +}; + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) +{ + JSUTF16 sur[2] = { 0 }; + int iSur = 0; + int index; + wchar_t *escOffset; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start ++; + + if ( (ds->end - ds->start) > escLen) + { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) + { + ds->escStart = (wchar_t *) ds->dec->realloc (ds->escStart, newSize * sizeof(wchar_t)); + if (!ds->escStart) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + } + else + { + wchar_t *oldStart = ds->escStart; + ds->escHeap = 1; + ds->escStart = (wchar_t *) ds->dec->malloc (newSize * sizeof(wchar_t)); + if (!ds->escStart) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + memcpy (ds->escStart, oldStart, escLen * sizeof(wchar_t)); + } + + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = ds->start; + + while(1) + { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) + { + case DS_ISNULL: + return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); + + case DS_ISQUOTE: + ds->lastType = JT_UTF8; + inputOffset ++; + ds->start += ( (char *) inputOffset - (ds->start)); + RETURN_JSOBJ_NULLCHECK(ds->dec->newString(ds->escStart, escOffset)); + + case DS_UTFLENERROR: + return SetError (ds, -1, "Invalid UTF-8 sequence length when decoding 'string'"); + + case DS_ISESCAPE: + inputOffset ++; + switch (*inputOffset) + { + case '\\': *(escOffset++) = L'\\'; inputOffset++; continue; + case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; + case '/': *(escOffset++) = L'/'; inputOffset++; continue; + case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; + case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; + case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; + case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; + case 't': *(escOffset++) = L'\t'; inputOffset++; continue; + + case 'u': + { + int index; + inputOffset ++; + + for (index = 0; index < 4; index ++) + { + switch (*inputOffset) + { + case '\0': return SetError (ds, -1, "Unterminated unicode escape sequence when decoding 'string'"); + default: return SetError (ds, -1, "Unexpected character in unicode escape sequence when decoding 'string'"); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); + break; + } + + inputOffset ++; + } + + + if (iSur == 0) + { + if((sur[iSur] & 0xfc00) == 0xd800) + { + // First of a surrogate pair, continue parsing + iSur ++; + break; + } + (*escOffset++) = (wchar_t) sur[iSur]; + iSur = 0; + } + else + { + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) + { + return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); + } + +#if WCHAR_MAX == 0xffff + (*escOffset++) = (wchar_t) sur[0]; + 
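+                            // (the low surrogate is emitted right after the high one)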
(*escOffset++) = (wchar_t) sur[1]; +#else + (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); +#endif + iSur = 0; + } + break; + } + + case '\0': return SetError(ds, -1, "Unterminated escape sequence when decoding 'string'"); + default: return SetError(ds, -1, "Unrecognized escape sequence when decoding 'string'"); + } + break; + + case 1: + *(escOffset++) = (wchar_t) (*inputOffset++); + break; + + case 2: + { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 3: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; + + for (index = 0; index < 2; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 4: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; + + for (index = 0; index < 3; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); + + #if WCHAR_MAX == 0xffff + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(escOffset++) = (ucs >> 10) + 0xd800; + *(escOffset++) = (ucs & 0x3ff) + 0xdc00; + } + else + { + *(escOffset++) = (wchar_t) ucs; + } + #else + *(escOffset++) = (wchar_t) ucs; + #endif + break; + } + } + } +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array( struct DecoderState *ds) +{ + JSOBJ itemValue; + JSOBJ newObj = ds->dec->newArray(ds->dec); + + ds->lastType = JT_INVALID; + ds->start ++; + + while (1)//(*ds->start) != '\0') + { + SkipWhitespace(ds); + + if ((*ds->start) == ']') + { + ds->start++; + return ds->dec->endArray(newObj); + } + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + if (!ds->dec->arrayAddItem (newObj, itemValue)) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case ']': + return ds->dec->endArray(newObj); + + case ',': + break; + + default: + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unexpected character in found when decoding array value"); + } + } + + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unmatched ']' when decoding 'array'"); +} + + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object( struct DecoderState *ds) +{ + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj = ds->dec->newObject(ds->dec); + + ds->start ++; + + while (1) + { + SkipWhitespace(ds); + + if ((*ds->start) == '}') + { + ds->start ++; + return ds->dec->endObject(newObj); + } + + ds->lastType = JT_INVALID; + itemName = decode_any(ds); + + if (itemName == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + if (ds->lastType != JT_UTF8) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + 
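+                // JSON object keys must decode to strings (JT_UTF8); anything else is malformed input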
return SetError(ds, -1, "Key name of object must be 'string' when decoding 'object'"); + } + + SkipWhitespace(ds); + + if (*(ds->start++) != ':') + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); + } + + SkipWhitespace(ds); + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + return NULL; + } + + if (!ds->dec->objectAddKey (newObj, itemName, itemValue)) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + ds->dec->releaseObject(itemValue, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case '}': + return ds->dec->endObject(newObj); + + case ',': + break; + + default: + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unexpected character in found when decoding object value"); + } + } + + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unmatched '}' when decoding object"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) +{ + while (1) + { + switch (*ds->start) + { + case '\"': + return decode_string (ds); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return decode_numeric (ds); + + case '[': return decode_array (ds); + case '{': return decode_object (ds); + case 't': return decode_true (ds); + case 'f': return decode_false (ds); + case 'n': return decode_null (ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start ++; + break; + + default: + return SetError(ds, -1, "Expected object or value"); + } + } +} + + +JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer) +{ + + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *) buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + + ds.dec = dec; + + ret = decode_any (&ds); + + if (ds.escHeap) + { + dec->free(ds.escStart); + } + return ret; +} diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c new file mode 100644 index 0000000000000..22871513870b7 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -0,0 +1,891 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. 
Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include + +#include + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; +static const char g_hexChars[] = "0123456789abcdef"; +static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; + + +/* +FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. +Needs a cleanup and more documentation */ + +/* +Table for pure ascii output escaping all characters above 127 to \uXXXX */ +static const JSUINT8 g_asciiOutputTable[256] = +{ +/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, +/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, +/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 +}; + + +static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) +{ + enc->errorMsg = message; + enc->errorObj = obj; +} + +/* +FIXME: Keep track of how big these get across several encoder calls and try to make an estimate +That way we won't run our head into the wall each call */ +void Buffer_Realloc (JSONObjectEncoder *enc, size_t cbNeeded) +{ + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; + + while (newSize < curSize + cbNeeded) + { + newSize *= 2; + } + + if (enc->heap) + { + enc->start = (char *) 
enc->realloc (enc->start, newSize); + if (!enc->start) + { + SetError (NULL, enc, "Could not reserve memory block"); + return; + } + } + else + { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *) enc->malloc (newSize); + if (!enc->start) + { + SetError (NULL, enc, "Could not reserve memory block"); + return; + } + memcpy (enc->start, oldStart, offset); + } + enc->offset = enc->start + offset; + enc->end = enc->start + newSize; +} + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (char *outputOffset, unsigned short value) +{ + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; +} + +int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + char *of = (char *) enc->offset; + + while (1) + { + switch (*io) + { + case 0x00: + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + + case '\"': (*of++) = '\\'; (*of++) = '\"'; break; + case '\\': (*of++) = '\\'; (*of++) = '\\'; break; + case '/': (*of++) = '\\'; (*of++) = '/'; break; + case '\b': (*of++) = '\\'; (*of++) = 'b'; break; + case '\f': (*of++) = '\\'; (*of++) = 'f'; break; + case '\n': (*of++) = '\\'; (*of++) = 'n'; break; + case '\r': (*of++) = '\\'; (*of++) = 'r'; break; + case '\t': (*of++) = '\\'; (*of++) = 't'; break; + + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + break; + + default: (*of++) = (*io); break; + } + + io++; + } + + return FALSE; +} + + +/* +FIXME: +This code only works with Little and Big Endian + +FIXME: The JSON spec says escape "/" but non of the others do and we don't +want to be left alone doing it so we don't :) + +*/ +int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + JSUTF32 ucs; + char *of = (char *) enc->offset; + + while (1) + { + + //JSUINT8 chr = (unsigned char) *io; + JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io]; + + switch (utflen) + { + case 0: + { + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io ++; + continue; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + } + + case 1: + { + *(of++)= (*io++); + continue; + } + + case 2: + { + JSUTF32 in; + JSUTF16 in16; + + if (end - io < 1) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + in = (JSUTF32) in16; + +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); +#else + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); +#endif + + if (ucs < 0x80) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 2 byte UTF-8 sequence detected when 
encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: + { + JSUTF32 in; + JSUTF16 in16; + JSUINT8 in8; + + if (end - io < 2) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + memcpy(&in8, io + 2, sizeof(JSUINT8)); +#ifdef __LITTLE_ENDIAN__ + in = (JSUTF32) in16; + in |= in8 << 16; + ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); +#else + in = in16 << 8; + in |= in8; + ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + + + if (ucs < 0x800) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: + { + JSUTF32 in; + + if (end - io < 3) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in, io, sizeof(JSUTF32)); +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); +#else + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + if (ucs < 0x10000) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 4 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 4; + break; + } + + + case 5: + case 6: + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; + + case 30: + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + io ++; + continue; + + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: + *(of++) = *( (char *) (g_escapeChars + utflen + 0)); + *(of++) = *( (char *) (g_escapeChars + utflen + 1)); + io ++; + continue; + } + + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (ucs & 0x3ff) + 0xdc00); + of += 4; + } + else + { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, ucs); + of += 4; + } + } + + return FALSE; +} + +#define Buffer_Reserve(__enc, __len) \ + if ((__enc)->end - (__enc)->offset < (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + + +#define Buffer_AppendCharUnchecked(__enc, __chr) \ + *((__enc)->offset++) = __chr; \ + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char* begin, char* end) +{ + char aux; + while (end > begin) + aux = *end, *end-- = *begin, *begin++ = aux; +} + +void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) +{ + char* wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10)); while(uvalue /= 10); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) +{ + char* wstr; + JSUINT64 uvalue = (value < 0) ? 
-value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10ULL)); while(uvalue /= 10ULL); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) +{ + /* if input is larger than thres_max, revert to exponential */ + const double thres_max = (double) 1e16 - 1; + int count; + double diff = 0.0; + char* str = enc->offset; + char* wstr = str; + unsigned long long whole; + double tmp; + unsigned long long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) + { + SetError (obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } + if (! (value == value)) + { + SetError (obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } + + + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) + { + neg = 1; + value = -value; + } + + pow10 = g_pow10[enc->doublePrecision]; + + whole = (unsigned long long) value; + tmp = (value - whole) * pow10; + frac = (unsigned long long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) + { + ++frac; + /* handle rollover, e.g. case 0.99 with prec 1 is 1.0 */ + if (frac >= pow10) + { + frac = 0; + ++whole; + } + } + else + if (diff == 0.5 && ((frac == 0) || (frac & 1))) + { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; + } + + /* for very large numbers switch back to native sprintf for exponentials. + anyone want to write code to replace this? */ + /* + normal printf behavior is to print EVERY whole number digit + which can be 100s of characters overflowing your buffers == bad + */ + if (value > thres_max) + { + enc->offset += sprintf(str, "%.15e", neg ? -value : value); + return TRUE; + } + + if (enc->doublePrecision == 0) + { + diff = value - whole; + + if (diff > 0.5) + { + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } + else + if (diff == 0.5 && (whole & 1)) + { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; + } + + //vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } + else + if (frac) + { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) + { + --count; + frac /= 10; + } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 + + // now do fractional part, as an unsigned number + do + { + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) + { + *wstr++ = '0'; + } + // add decimal + *wstr++ = '.'; + } + else + { + *wstr++ = '0'; + *wstr++ = '.'; + } + + // do whole part + // Take care of sign + // Conversion. Number is reversed. 
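+    // digits are produced least-significant first; strreverse() below restores the printed order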
+ do *wstr++ = (char)(48 + (whole % 10)); while (whole /= 10); + + if (neg) + { + *wstr++ = '-'; + } + strreverse(str, wstr-1); + enc->offset += (wstr - (enc->offset)); + + return TRUE; +} + + + + + + +/* +FIXME: +Handle integration functions returning NULL here */ + +/* +FIXME: +Perhaps implement recursion detection */ + +void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) +{ + const char *value; + char *objName; + int count; + JSOBJ iterObj; + size_t szlen; + JSONTypeContext tc; + tc.encoder = enc; + + if (enc->level > enc->recursionMax) + { + SetError (obj, enc, "Maximum recursion level reached"); + return; + } + + /* + This reservation must hold + + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + + Since input is assumed to be UTF-8 the worst character length is: + + 4 bytes (of UTF-8) => "\uXXXX\uXXXX" (12 bytes) + */ + + Buffer_Reserve(enc, 256 + (((cbName / 4) + 1) * 12)); + if (enc->errorMsg) + { + return; + } + + if (name) + { + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) + { + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(obj, enc, name, name + cbName)) + { + return; + } + } + + + Buffer_AppendCharUnchecked(enc, '\"'); + + Buffer_AppendCharUnchecked (enc, ':'); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + enc->beginTypeContext(obj, &tc); + + switch (tc.type) + { + case JT_INVALID: + return; + + case JT_ARRAY: + { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '['); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (buffer, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + + enc->level ++; + encode (iterObj, enc, NULL, 0); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, ']'); + break; + } + + case JT_OBJECT: + { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '{'); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); + + enc->level ++; + encode (iterObj, enc, objName, szlen); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, '}'); + break; + } + + case JT_LONG: + { + Buffer_AppendLongUnchecked (enc, enc->getLongValue(obj, &tc)); + break; + } + + case JT_INT: + { + Buffer_AppendIntUnchecked (enc, enc->getIntValue(obj, &tc)); + break; + } + + case JT_TRUE: + { + Buffer_AppendCharUnchecked (enc, 't'); + Buffer_AppendCharUnchecked (enc, 'r'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + case JT_FALSE: + { + Buffer_AppendCharUnchecked (enc, 'f'); + Buffer_AppendCharUnchecked (enc, 'a'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 's'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + + case JT_NULL: + { + Buffer_AppendCharUnchecked (enc, 'n'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 'l'); + break; + } + + case JT_DOUBLE: + { + if (!Buffer_AppendDoubleUnchecked (obj, enc, enc->getDoubleValue(obj, &tc))) + { + 
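+            // Buffer_AppendDoubleUnchecked rejects NaN/Inf: unwind the type context and abort this value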
enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + break; + } + + case JT_UTF8: + { + value = enc->getStringValue(obj, &tc, &szlen); + Buffer_Reserve(enc, ((szlen / 4) + 1) * 12); + if (enc->errorMsg) + { + enc->endTypeContext(obj, &tc); + return; + } + Buffer_AppendCharUnchecked (enc, '\"'); + + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + + Buffer_AppendCharUnchecked (enc, '\"'); + break; + } + } + + enc->endTypeContext(obj, &tc); + enc->level --; + +} + +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer) +{ + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) + { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + } + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) + { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + } + + if (_buffer == NULL) + { + _cbBuffer = 32768; + enc->start = (char *) enc->malloc (_cbBuffer); + if (!enc->start) + { + SetError(obj, enc, "Could not reserve memory block"); + return NULL; + } + enc->heap = 1; + } + else + { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + + encode (obj, enc, NULL, 0); + + Buffer_Reserve(enc, 1); + if (enc->errorMsg) + { + return NULL; + } + Buffer_AppendCharUnchecked(enc, '\0'); + + return enc->start; +} diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c new file mode 100644 index 0000000000000..1db7586ad17f7 --- /dev/null +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -0,0 +1,674 @@ +#include "py_defines.h" +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#define NO_IMPORT_ARRAY +#include +#include + + +typedef struct __PyObjectDecoder +{ + JSONObjectDecoder dec; + + void* npyarr; // Numpy context buffer + npy_intp curdim; // Current array dimension + + PyArray_Descr* dtype; +} PyObjectDecoder; + +typedef struct __NpyArrContext +{ + PyObject* ret; + PyObject* labels[2]; + PyArray_Dims shape; + + PyObjectDecoder* dec; + + npy_intp i; + npy_intp elsize; + npy_intp elcount; +} NpyArrContext; + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +// Numpy handling based on numpy internal code, specifically the function +// PyArray_FromIter. + +// numpy related functions are inter-dependent so declare them all here, +// to ensure the compiler catches any errors + +// standard numpy array handling +JSOBJ Object_npyNewArray(void* decoder); +JSOBJ Object_npyEndArray(JSOBJ obj); +int Object_npyArrayAddItem(JSOBJ obj, JSOBJ value); + +// for more complex dtypes (object and string) fill a standard Python list +// and convert to a numpy array when done. 
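+// (a numpy array of variable-size or reference-counted elements cannot simply
+// be grown with PyDataMem_RENEW without extra reference-count bookkeeping, so
+// decoding into a list and converting once at the end is simpler and safer)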
+JSOBJ Object_npyNewArrayList(void* decoder); +JSOBJ Object_npyEndArrayList(JSOBJ obj); +int Object_npyArrayListAddItem(JSOBJ obj, JSOBJ value); + +// labelled support, encode keys and values of JS object into separate numpy +// arrays +JSOBJ Object_npyNewObject(void* decoder); +JSOBJ Object_npyEndObject(JSOBJ obj); +int Object_npyObjectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value); + + +// free the numpy context buffer +void Npy_releaseContext(NpyArrContext* npyarr) +{ + PRINTMARK(); + if (npyarr) + { + if (npyarr->shape.ptr) + { + PyObject_Free(npyarr->shape.ptr); + } + if (npyarr->dec) + { + // Don't set to null, used to make sure we don't Py_DECREF npyarr + // in releaseObject + // npyarr->dec->npyarr = NULL; + npyarr->dec->curdim = 0; + } + Py_XDECREF(npyarr->labels[0]); + Py_XDECREF(npyarr->labels[1]); + Py_XDECREF(npyarr->ret); + PyObject_Free(npyarr); + } +} + +JSOBJ Object_npyNewArray(void* _decoder) +{ + NpyArrContext* npyarr; + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + if (decoder->curdim <= 0) + { + // start of array - initialise the context buffer + npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + + if (!npyarr) + { + PyErr_NoMemory(); + return NULL; + } + + npyarr->dec = decoder; + npyarr->labels[0] = npyarr->labels[1] = NULL; + + npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp)*NPY_MAXDIMS); + npyarr->shape.len = 1; + npyarr->ret = NULL; + + npyarr->elsize = 0; + npyarr->elcount = 4; + npyarr->i = 0; + } + else + { + // starting a new dimension continue the current array (and reshape after) + npyarr = (NpyArrContext*) decoder->npyarr; + if (decoder->curdim >= npyarr->shape.len) + { + npyarr->shape.len++; + } + } + + npyarr->shape.ptr[decoder->curdim] = 0; + decoder->curdim++; + return npyarr; +} + +PyObject* Npy_returnLabelled(NpyArrContext* npyarr) +{ + PyObject* ret = npyarr->ret; + npy_intp i; + + if (npyarr->labels[0] || npyarr->labels[1]) + { + // finished decoding, build tuple with values and labels + ret = PyTuple_New(npyarr->shape.len+1); + for (i = 0; i < npyarr->shape.len; i++) + { + if (npyarr->labels[i]) + { + PyTuple_SET_ITEM(ret, i+1, npyarr->labels[i]); + npyarr->labels[i] = NULL; + } + else + { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(ret, i+1, Py_None); + } + } + PyTuple_SET_ITEM(ret, 0, npyarr->ret); + } + + return ret; +} + +JSOBJ Object_npyEndArray(JSOBJ obj) +{ + PyObject *ret; + char* new_data; + NpyArrContext* npyarr = (NpyArrContext*) obj; + int emptyType = NPY_DEFAULT_TYPE; + npy_intp i; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + ret = npyarr->ret; + i = npyarr->i; + + npyarr->dec->curdim--; + + if (i == 0 || !npyarr->ret) { + // empty array would not have been initialised so do it now. 
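+        // honour a caller-supplied dtype when present; otherwise fall back to numpy's default type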
+ if (npyarr->dec->dtype) + { + emptyType = npyarr->dec->dtype->type_num; + } + npyarr->ret = ret = PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); + } + else if (npyarr->dec->curdim <= 0) + { + // realloc to final size + new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); + if (new_data == NULL) { + PyErr_NoMemory(); + Npy_releaseContext(npyarr); + return NULL; + } + ((PyArrayObject*) ret)->data = (void*) new_data; + // PyArray_BYTES(ret) = new_data; + } + + if (npyarr->dec->curdim <= 0) + { + // finished decoding array, reshape if necessary + if (npyarr->shape.len > 1) + { + npyarr->ret = PyArray_Newshape((PyArrayObject*) ret, &npyarr->shape, NPY_ANYORDER); + Py_DECREF(ret); + } + + ret = Npy_returnLabelled(npyarr); + + npyarr->ret = NULL; + Npy_releaseContext(npyarr); + } + + return ret; +} + +int Object_npyArrayAddItem(JSOBJ obj, JSOBJ value) +{ + PyObject* type; + PyArray_Descr* dtype; + npy_intp i; + char *new_data, *item; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + + i = npyarr->i; + + npyarr->shape.ptr[npyarr->dec->curdim-1]++; + + if (PyArray_Check((PyObject*)value)) + { + // multidimensional array, keep decoding values. + return 1; + } + + if (!npyarr->ret) + { + // Array not initialised yet. + // We do it here so we can 'sniff' the data type if none was provided + if (!npyarr->dec->dtype) + { + type = PyObject_Type(value); + if(!PyArray_DescrConverter(type, &dtype)) + { + Py_DECREF(type); + goto fail; + } + Py_INCREF(dtype); + Py_DECREF(type); + } + else + { + dtype = PyArray_DescrNew(npyarr->dec->dtype); + } + + // If it's an object or string then fill a Python list and subsequently + // convert. Otherwise we would need to somehow mess about with + // reference counts when renewing memory. + npyarr->elsize = dtype->elsize; + if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) + { + Py_XDECREF(dtype); + + if (npyarr->dec->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + npyarr->elcount = 0; + npyarr->ret = PyList_New(0); + if (!npyarr->ret) + { + goto fail; + } + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArrayList; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayListAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArrayList; + return Object_npyArrayListAddItem(obj, value); + } + + npyarr->ret = PyArray_NewFromDescr(&PyArray_Type, dtype, 1, + &npyarr->elcount, NULL,NULL, 0, NULL); + + if (!npyarr->ret) + { + goto fail; + } + } + + if (i >= npyarr->elcount) { + // Grow PyArray_DATA(ret): + // this is similar for the strategy for PyListObject, but we use + // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... + if (npyarr->elsize == 0) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + + npyarr->elcount = (i >> 1) + (i < 4 ? 
4 : 2) + i; + if (npyarr->elcount <= NPY_MAX_INTP/npyarr->elsize) { + new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), npyarr->elcount * npyarr->elsize); + } + else { + PyErr_NoMemory(); + goto fail; + } + ((PyArrayObject*) npyarr->ret)->data = (void*) new_data; + + // PyArray_BYTES(npyarr->ret) = new_data; + } + + PyArray_DIMS(npyarr->ret)[0] = i + 1; + + if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL + || PyArray_SETITEM(npyarr->ret, item, value) == -1) { + goto fail; + } + + Py_DECREF( (PyObject *) value); + npyarr->i++; + return 1; + +fail: + + Npy_releaseContext(npyarr); + return 0; +} + +JSOBJ Object_npyNewArrayList(void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, "nesting not supported for object or variable length dtypes"); + Npy_releaseContext(decoder->npyarr); + return NULL; +} + +JSOBJ Object_npyEndArrayList(JSOBJ obj) +{ + PyObject *list, *ret; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + // convert decoded list to numpy array + list = (PyObject *) npyarr->ret; + npyarr->ret = PyArray_FROM_O(list); + + ret = Npy_returnLabelled(npyarr); + npyarr->ret = list; + + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArray; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArray; + Npy_releaseContext(npyarr); + return ret; +} + +int Object_npyArrayListAddItem(JSOBJ obj, JSOBJ value) +{ + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + PyList_Append((PyObject*) npyarr->ret, value); + Py_DECREF( (PyObject *) value); + npyarr->elcount++; + return 1; +} + + +JSOBJ Object_npyNewObject(void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + if (decoder->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "labels only supported up to 2 dimensions"); + return NULL; + } + + return ((JSONObjectDecoder*)decoder)->newArray(decoder); +} + +JSOBJ Object_npyEndObject(JSOBJ obj) +{ + PyObject *list; + npy_intp labelidx; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + labelidx = npyarr->dec->curdim-1; + + list = npyarr->labels[labelidx]; + if (list) + { + npyarr->labels[labelidx] = PyArray_FROM_O(list); + Py_DECREF(list); + } + + return (PyObject*) ((JSONObjectDecoder*)npyarr->dec)->endArray(obj); +} + +int Object_npyObjectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyObject *label; + npy_intp labelidx; + // add key to label array, value to values array + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + + label = (PyObject*) name; + labelidx = npyarr->dec->curdim-1; + + if (!npyarr->labels[labelidx]) + { + npyarr->labels[labelidx] = PyList_New(0); + } + + // only fill label array once, assumes all column labels are the same + // for 2-dimensional arrays. 
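+    // (the size check below keeps the list from growing again when later
+    // rows repeat the same keys)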
+ if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) + { + PyList_Append(npyarr->labels[labelidx], label); + } + + if(((JSONObjectDecoder*)npyarr->dec)->arrayAddItem(obj, value)) + { + Py_DECREF(label); + return 1; + } + return 0; +} + +int Object_objectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyDict_SetItem (obj, name, value); + Py_DECREF( (PyObject *) name); + Py_DECREF( (PyObject *) value); + return 1; +} + +int Object_arrayAddItem(JSOBJ obj, JSOBJ value) +{ + PyList_Append(obj, value); + Py_DECREF( (PyObject *) value); + return 1; +} + +JSOBJ Object_newString(wchar_t *start, wchar_t *end) +{ + return PyUnicode_FromWideChar (start, (end - start)); +} + +JSOBJ Object_newTrue(void) +{ + Py_RETURN_TRUE; +} + +JSOBJ Object_newFalse(void) +{ + Py_RETURN_FALSE; +} + +JSOBJ Object_newNull(void) +{ + Py_RETURN_NONE; +} + +JSOBJ Object_newObject(void* decoder) +{ + return PyDict_New(); +} + +JSOBJ Object_endObject(JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newArray(void* decoder) +{ + return PyList_New(0); +} + +JSOBJ Object_endArray(JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newInteger(JSINT32 value) +{ + return PyInt_FromLong( (long) value); +} + +JSOBJ Object_newLong(JSINT64 value) +{ + return PyLong_FromLongLong (value); +} + +JSOBJ Object_newDouble(double value) +{ + return PyFloat_FromDouble(value); +} + +static void Object_releaseObject(JSOBJ obj, void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + if (obj != decoder->npyarr) + { + Py_XDECREF( ((PyObject *)obj)); + } +} + + +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *ret; + PyObject *sarg; + JSONObjectDecoder *decoder; + PyObjectDecoder pyDecoder; + PyArray_Descr *dtype = NULL; + static char *kwlist[] = { "obj", "numpy", "labelled", "dtype", NULL}; + int numpy = 0, labelled = 0, decref = 0; + // PRINTMARK(); + + JSONObjectDecoder dec = { + Object_newString, + Object_objectAddKey, + Object_arrayAddItem, + Object_newTrue, + Object_newFalse, + Object_newNull, + Object_newObject, + Object_endObject, + Object_newArray, + Object_endArray, + Object_newInteger, + Object_newLong, + Object_newDouble, + Object_releaseObject, + PyObject_Malloc, + PyObject_Free, + PyObject_Realloc, + }; + pyDecoder.dec = dec; + pyDecoder.curdim = 0; + pyDecoder.npyarr = NULL; + + decoder = (JSONObjectDecoder*) &pyDecoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiO&", kwlist, &sarg, &numpy, &labelled, PyArray_DescrConverter2, &dtype)) + { + return NULL; + } + + if (PyUnicode_Check(sarg)) + { + sarg = PyUnicode_AsUTF8String(sarg); + if (sarg == NULL) + { + //Exception raised above us by codec according to docs + return NULL; + } + decref = 1; + } + else + if (!PyString_Check(sarg)) + { + PyErr_Format(PyExc_TypeError, "Expected String or Unicode"); + return NULL; + } + + if (numpy) + { + pyDecoder.dtype = dtype; + decoder->newArray = Object_npyNewArray; + decoder->endArray = Object_npyEndArray; + decoder->arrayAddItem = Object_npyArrayAddItem; + + if (labelled) + { + decoder->newObject = Object_npyNewObject; + decoder->endObject = Object_npyEndObject; + decoder->objectAddKey = Object_npyObjectAddKey; + } + } + + decoder->errorStr = NULL; + decoder->errorOffset = NULL; + + PRINTMARK(); + ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg), PyString_GET_SIZE(sarg)); + PRINTMARK(); + + if (decref) + { + Py_DECREF(sarg); + } + + if (PyErr_Occurred()) + { + return NULL; + } + + if (decoder->errorStr) + { + /*FIXME: It's possible to give a much nicer error 
message here with actual failing element in input etc*/
+    PyErr_Format (PyExc_ValueError, "%s", decoder->errorStr);
+    Py_XDECREF( (PyObject *) ret);
+    Npy_releaseContext(pyDecoder.npyarr);
+
+    return NULL;
+  }
+
+  return ret;
+}
+
+PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs)
+{
+  PyObject *file;
+  PyObject *read;
+  PyObject *string;
+  PyObject *result;
+  PyObject *argtuple;
+
+  if (!PyArg_ParseTuple (args, "O", &file)) {
+    return NULL;
+  }
+
+  if (!PyObject_HasAttrString (file, "read"))
+  {
+    PyErr_Format (PyExc_TypeError, "expected file");
+    return NULL;
+  }
+
+  read = PyObject_GetAttrString (file, "read");
+
+  if (!PyCallable_Check (read)) {
+    Py_XDECREF(read);
+    PyErr_Format (PyExc_TypeError, "expected file");
+    return NULL;
+  }
+
+  string = PyObject_CallObject (read, NULL);
+  Py_XDECREF(read);
+
+  if (string == NULL)
+  {
+    return NULL;
+  }
+
+  argtuple = PyTuple_Pack(1, string);
+
+  result = JSONToObj (self, argtuple, kwargs);
+  Py_XDECREF(string);
+  Py_DECREF(argtuple);
+
+  if (result == NULL) {
+    return NULL;
+  }
+
+  return result;
+}
+
diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c
new file mode 100644
index 0000000000000..ce8bdf3721f5e
--- /dev/null
+++ b/pandas/src/ujson/python/objToJSON.c
@@ -0,0 +1,1701 @@
+#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
+
+#include "py_defines.h"
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+#include <np_datetime.h>
+#include <np_datetime_strings.h>
+#include <datetime.h>
+#include <ultrajson.h>
+
+#define NPY_JSON_BUFSIZE 32768
+
+static PyObject* cls_dataframe;
+static PyObject* cls_series;
+static PyObject* cls_index;
+
+typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, void *outValue, size_t *_outLen);
+
+
+#if (PY_VERSION_HEX < 0x02050000)
+typedef ssize_t Py_ssize_t;
+#endif
+
+typedef struct __NpyArrContext
+{
+  PyObject *array;
+  char* dataptr;
+  int was_datetime64;
+  int curdim;     // current dimension in array's order
+  int stridedim;  // dimension we are striding over
+  int inc;        // stride dimension increment (+/- 1)
+  npy_intp dim;
+  npy_intp stride;
+  npy_intp ndim;
+  npy_intp index[NPY_MAXDIMS];
+  PyArray_GetItemFunc* getitem;
+
+  char** rowLabels;
+  char** columnLabels;
+} NpyArrContext;
+
+typedef struct __TypeContext
+{
+  JSPFN_ITERBEGIN iterBegin;
+  JSPFN_ITEREND iterEnd;
+  JSPFN_ITERNEXT iterNext;
+  JSPFN_ITERGETNAME iterGetName;
+  JSPFN_ITERGETVALUE iterGetValue;
+  PFN_PyTypeToJSON PyTypeToJSON;
+  PyObject *newObj;
+  PyObject *dictObj;
+  Py_ssize_t index;
+  Py_ssize_t size;
+  PyObject *itemValue;
+  PyObject *itemName;
+  PyObject *attrList;
+  char *citemName;
+
+  JSINT64 longValue;
+
+  NpyArrContext *npyarr;
+  int transpose;
+  char** rowLabels;
+  char** columnLabels;
+  npy_intp rowLabelsLen;
+  npy_intp columnLabelsLen;
+
+} TypeContext;
+
+typedef struct __PyObjectEncoder
+{
+  JSONObjectEncoder enc;
+
+  // pass through the NpyArrContext when encoding multi-dimensional arrays
+  NpyArrContext* npyCtxtPassthru;
+
+  // output format style for pandas data types
+  int outputFormat;
+  int originalOutputFormat;
+} PyObjectEncoder;
+
+#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv))
+
+struct PyDictIterState
+{
+  PyObject *keys;
+  size_t i;
+  size_t sz;
+};
+
+enum PANDAS_FORMAT
+{
+  SPLIT,
+  RECORDS,
+  INDEX,
+  COLUMNS,
+  VALUES
+};
+
+//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__)
+#define PRINTMARK()
+
+void initObjToJSON(void)
+{
+  PyObject *mod_frame;
+  PyDateTime_IMPORT;
+
+  mod_frame = PyImport_ImportModule("pandas.core.frame");
+  if (mod_frame)
+  {
+    cls_dataframe = PyObject_GetAttrString(mod_frame,
"DataFrame"); + cls_index = PyObject_GetAttrString(mod_frame, "Index"); + cls_series = PyObject_GetAttrString(mod_frame, "Series"); + Py_DECREF(mod_frame); + } + + /* Initialise numpy API */ + import_array(); +} + +static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT32 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT64 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + *((JSINT64 *) outValue) = GET_TC(tc)->longValue; + return NULL; +} + +static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); + return NULL; +} + +static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((double *) outValue) = PyFloat_AS_DOUBLE (obj); + return NULL; +} + +static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *_outLen = PyString_GET_SIZE(obj); + return PyString_AS_STRING(obj); +} + +static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *newObj = PyUnicode_AsUTF8String (obj); + + GET_TC(tc)->newObj = newObj; + + *_outLen = PyString_GET_SIZE(newObj); + return PyString_AS_STRING(newObj); +} + +static void *NpyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DATETIME)); + return NULL; +} + +static void *PyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + PyObject *obj = (PyObject *) _obj; + + dts.year = PyDateTime_GET_YEAR(obj); + dts.month = PyDateTime_GET_MONTH(obj); + dts.day = PyDateTime_GET_DAY(obj); + dts.hour = PyDateTime_DATE_GET_HOUR(obj); + dts.min = PyDateTime_DATE_GET_MINUTE(obj); + dts.sec = PyDateTime_DATE_GET_SECOND(obj); + dts.us = PyDateTime_DATE_GET_MICROSECOND(obj); + dts.ps = dts.as = 0; + *((JSINT64*)outValue) = (JSINT64) pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts); + return NULL; +} + +static void *PyDateToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + PyObject *obj = (PyObject *) _obj; + + dts.year = PyDateTime_GET_YEAR(obj); + dts.month = PyDateTime_GET_MONTH(obj); + dts.day = PyDateTime_GET_DAY(obj); + dts.hour = dts.min = dts.sec = dts.ps = dts.as = 0; + *((JSINT64*)outValue) = (JSINT64) pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts); + return NULL; +} + +//============================================================================= +// Numpy array iteration functions +//============================================================================= +int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) +{ + return 0; +} + +void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) +{ + PyArrayObject *obj; + PyArray_Descr *dtype; + NpyArrContext *npyarr; + + if (GET_TC(tc)->newObj) + { + obj = (PyArrayObject *) GET_TC(tc)->newObj; + } + else + { + obj = 
(PyArrayObject *) _obj; + } + + if (PyArray_SIZE(obj) > 0) + { + PRINTMARK(); + npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + GET_TC(tc)->npyarr = npyarr; + + if (!npyarr) + { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + // uber hack to support datetime64[ns] arrays + if (PyArray_DESCR(obj)->type_num == NPY_DATETIME) { + npyarr->was_datetime64 = 1; + dtype = PyArray_DescrFromType(NPY_INT64); + obj = (PyArrayObject *) PyArray_CastToType(obj, dtype, 0); + } else { + npyarr->was_datetime64 = 0; + } + + npyarr->array = (PyObject*) obj; + npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + + if (GET_TC(tc)->transpose) + { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } + else + { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } + + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = GET_TC(tc)->rowLabels; + } + else + { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + } + PRINTMARK(); +} + +void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + + if (npyarr) + { + if (npyarr->was_datetime64) { + Py_XDECREF(npyarr->array); + } + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + } + GET_TC(tc)->itemValue = NULL; + + PyObject_Free(npyarr); + } + PRINTMARK(); +} + +void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); +} + +void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + // finished this dimension, reset the data pointer + npyarr = GET_TC(tc)->npyarr; + npyarr->curdim--; + npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } +} + +int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + return 0; + } + + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; +} + +int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + + if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } + + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; + + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + 
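// reset the index on the new stride dimension, then hand the same array
+  // back to the encoder, which re-enters this context via npyCtxtPassthru
+  // in Object_beginTypeContext
+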
npyarr->index[npyarr->stridedim] = 0; + + ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; +} + +JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + NpyArrContext* npyarr; + npy_intp idx; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) + { + idx = npyarr->index[npyarr->stridedim] - 1; + *outLen = strlen(npyarr->columnLabels[idx]); + return npyarr->columnLabels[idx]; + } + else + { + idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + *outLen = strlen(npyarr->rowLabels[idx]); + return npyarr->rowLabels[idx]; + } +} + +//============================================================================= +// Tuple iteration functions +// itemValue is borrowed reference, no ref counting +//============================================================================= +void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE( (PyObject *) obj); + GET_TC(tc)->itemValue = NULL; +} + +int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + PyObject *item; + + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + return 0; + } + + item = PyTuple_GET_ITEM (obj, GET_TC(tc)->index); + + GET_TC(tc)->itemValue = item; + GET_TC(tc)->index ++; + return 1; +} + +void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// Dir iteration functions +// itemName ref is borrowed from PyObject_Dir (attrList). No refcount +// itemValue ref is from PyObject_GetAttr. 
Ref counted +//============================================================================= +void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); + PRINTMARK(); +} + +void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + Py_DECREF( (PyObject *) GET_TC(tc)->attrList); + PRINTMARK(); +} + +int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = GET_TC(tc)->itemName; + PyObject* attr; + PyObject* attrName; + char* attrStr; + + + if (itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } + + if (itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = itemName = NULL; + } + + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index ++) + { + attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); +#if PY_MAJOR_VERSION >= 3 + attr = PyUnicode_AsUTF8String(attrName); +#else + attr = attrName; + Py_INCREF(attr); +#endif + attrStr = PyString_AS_STRING(attr); + + if (attrStr[0] == '_') + { + PRINTMARK(); + Py_DECREF(attr); + continue; + } + + itemValue = PyObject_GetAttr(obj, attrName); + if (itemValue == NULL) + { + PyErr_Clear(); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + if (PyCallable_Check(itemValue)) + { + Py_DECREF(itemValue); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + PRINTMARK(); + itemName = attr; + break; + } + + if (itemName == NULL) + { + GET_TC(tc)->index = GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index ++; + + PRINTMARK(); + return 1; +} + + + +JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + PRINTMARK(); + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + + + + +//============================================================================= +// List iteration functions +// itemValue is borrowed from object (which is list). 
No refcounting +//============================================================================= +void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE( (PyObject *) obj); +} + +int List_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->itemValue = PyList_GET_ITEM (obj, GET_TC(tc)->index); + GET_TC(tc)->index ++; + return 1; +} + +void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// pandas Index iteration functions +//============================================================================= +void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->citemName) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "name", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->citemName) + { + PyObject_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// pandas Series iteration functions +//============================================================================= +void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->citemName) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "name", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "index", sizeof(char)*6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + 
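// restore the output format that Series_iterBegin overrode with VALUES
+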
enc->outputFormat = enc->originalOutputFormat; + if (GET_TC(tc)->citemName) + { + PyObject_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// pandas DataFrame iteration functions +//============================================================================= +void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->citemName) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "columns", sizeof(char)*8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "index", sizeof(char)*6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + enc->outputFormat = enc->originalOutputFormat; + if (GET_TC(tc)->citemName) + { + PyObject_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// Dict iteration functions +// itemName might converted to string (Python_Str). Do refCounting +// itemValue is borrowed from object (which is dict). 
No refCounting +//============================================================================= +void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + PRINTMARK(); +} + +int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ +#if PY_MAJOR_VERSION >= 3 + PyObject* itemNameTmp; +#endif + + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + + if (!PyDict_Next ( (PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) + { + PRINTMARK(); + return 0; + } + + if (PyUnicode_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); + } + else + if (!PyString_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); +#if PY_MAJOR_VERSION >= 3 + itemNameTmp = GET_TC(tc)->itemName; + GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); + Py_DECREF(itemNameTmp); +#endif + } + else + { + Py_INCREF(GET_TC(tc)->itemName); + } + PRINTMARK(); + return 1; +} + +void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); + PRINTMARK(); +} + +JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + +void NpyArr_freeLabels(char** labels, npy_intp len) +{ + npy_intp i; + + if (labels) + { + for (i = 0; i < len; i++) + { + PyObject_Free(labels[i]); + } + PyObject_Free(labels); + } +} + +char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_intp num) +{ + // NOTE this function steals a reference to labels. 
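+  // (each exit path below releases that reference exactly once)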
+ PyArray_Descr *dtype = NULL; + PyArrayObject* labelsTmp = NULL; + PyObject* item = NULL; + npy_intp i, stride, len; + // npy_intp bufsize = 32768; + char** ret; + char *dataptr, *cLabel, *origend, *origst, *origoffset; + char labelBuffer[NPY_JSON_BUFSIZE]; + PyArray_GetItemFunc* getitem; + PRINTMARK(); + + if (PyArray_SIZE(labels) < num) + { + PyErr_SetString(PyExc_ValueError, "Label array sizes do not match corresponding data shape"); + Py_DECREF(labels); + return 0; + } + + ret = PyObject_Malloc(sizeof(char*)*num); + if (!ret) + { + PyErr_NoMemory(); + Py_DECREF(labels); + return 0; + } + + for (i = 0; i < num; i++) + { + ret[i] = NULL; + } + + origst = enc->start; + origend = enc->end; + origoffset = enc->offset; + + if (PyArray_DESCR(labels)->type_num == NPY_DATETIME) { + dtype = PyArray_DescrFromType(NPY_INT64); + labelsTmp = labels; + labels = (PyArrayObject *) PyArray_CastToType(labels, dtype, 0); + Py_DECREF(labelsTmp); + } + + stride = PyArray_STRIDE(labels, 0); + dataptr = PyArray_DATA(labels); + getitem = PyArray_DESCR(labels)->f->getitem; + + for (i = 0; i < num; i++) + { + item = getitem(dataptr, labels); + if (!item) + { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); + Py_DECREF(item); + + if (PyErr_Occurred() || enc->errorMsg) + { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + // trim off any quotes surrounding the result + if (*cLabel == '\"') + { + cLabel++; + enc->offset -= 2; + *(enc->offset) = '\0'; + } + + len = enc->offset - cLabel + 1; + ret[i] = PyObject_Malloc(sizeof(char)*len); + + if (!ret[i]) + { + PyErr_NoMemory(); + ret = 0; + break; + } + + memcpy(ret[i], cLabel, sizeof(char)*len); + dataptr += stride; + } + + enc->start = origst; + enc->end = origend; + enc->offset = origoffset; + + Py_DECREF(labels); + return ret; +} + +void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj, *exc, *toDictFunc; + TypeContext *pc; + PyObjectEncoder *enc; + double val; + PRINTMARK(); + if (!_obj) { + tc->type = JT_INVALID; + return; + } + + obj = (PyObject*) _obj; + enc = (PyObjectEncoder*) tc->encoder; + + tc->prv = PyObject_Malloc(sizeof(TypeContext)); + pc = (TypeContext *) tc->prv; + if (!pc) + { + tc->type = JT_INVALID; + PyErr_NoMemory(); + return; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->citemName = NULL; + pc->npyarr = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; + + if (PyIter_Check(obj) || PyArray_Check(obj)) + { + goto ISITERABLE; + } + + if (PyBool_Check(obj)) + { + PRINTMARK(); + tc->type = (obj == Py_True) ? 
JT_TRUE : JT_FALSE; + return; + } + else + if (PyLong_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + GET_TC(tc)->longValue = PyLong_AsLongLong(obj); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + goto INVALID; + } + + return; + } + else + if (PyInt_Check(obj)) + { + PRINTMARK(); +#ifdef _LP64 + pc->PyTypeToJSON = PyIntToINT64; tc->type = JT_LONG; +#else + pc->PyTypeToJSON = PyIntToINT32; tc->type = JT_INT; +#endif + return; + } + else + if (PyArray_IsScalar(obj, Integer)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + goto INVALID; + } + + return; + } + else + if (PyString_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyStringToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyUnicode_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyUnicodeToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyFloat_Check(obj)) + { + PRINTMARK(); + val = PyFloat_AS_DOUBLE (obj); + if (npy_isnan(val) || npy_isinf(val)) + { + tc->type = JT_NULL; + } + else + { + pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; + } + return; + } + else + if (PyArray_IsScalar(obj, Float)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; + return; + } + else + if (PyArray_IsScalar(obj, Datetime)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyDateTimeToINT64; tc->type = JT_LONG; + return; + } + else + if (PyDateTime_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyDateTimeToINT64; tc->type = JT_LONG; + return; + } + else + if (PyDate_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyDateToINT64; tc->type = JT_LONG; + return; + } + else + if (obj == Py_None) + { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + +ISITERABLE: + + if (PyDict_Check(obj)) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); + + return; + } + else + if (PyList_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } + else + if (PyTuple_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_index)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_series)) + { + if 
(enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->columnLabelsLen = PyArray_SIZE(obj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + goto INVALID; + } + } + else + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyArray_Check(obj)) + { + if (enc->npyCtxtPassthru) + { + PRINTMARK(); + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + enc->npyCtxtPassthru = NULL; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_dataframe)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + + PRINTMARK(); + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + if (enc->outputFormat == VALUES) + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + else + if (enc->outputFormat == RECORDS) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + goto INVALID; + } + } + else + if (enc->outputFormat == INDEX) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + } + else + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + goto 
INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + pc->transpose = 1; + } + return; + } + + + toDictFunc = PyObject_GetAttrString(obj, "toDict"); + + if (toDictFunc) + { + PyObject* tuple = PyTuple_New(0); + PyObject* toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); + + if (toDictResult == NULL) + { + PyErr_Clear(); + tc->type = JT_NULL; + return; + } + + if (!PyDict_Check(toDictResult)) + { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; + return; + } + + PyErr_Clear(); + + tc->type = JT_OBJECT; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; + + return; + +INVALID: + tc->type = JT_INVALID; + PyObject_Free(tc->prv); + tc->prv = NULL; + return; +} + + +void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_XDECREF(GET_TC(tc)->newObj); + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); + + PyObject_Free(tc->prv); + tc->prv = NULL; +} + +const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) +{ + return GET_TC(tc)->PyTypeToJSON (obj, tc, NULL, _outLen); +} + +JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT64 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + + return ret; +} + +JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT32 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + + +double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) +{ + double ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + +static void Object_releaseObject(JSOBJ _obj) +{ + Py_DECREF( (PyObject *) _obj); +} + + + +void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterBegin(obj, tc); +} + +int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterNext(obj, tc); +} + +void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterEnd(obj, tc); +} + +JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterGetValue(obj, tc); +} + +char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return GET_TC(tc)->iterGetName(obj, tc, outLen); +} + + +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "orient", NULL}; + + char buffer[65536]; + char *ret; + PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + char *sOrient = NULL; + int idoublePrecision = 5; // default double precision setting + + PyObjectEncoder pyEncoder = + { + { + Object_beginTypeContext, //void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); + Object_endTypeContext, //void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); + Object_getStringValue, //const char *(*getStringValue)(JSOBJ obj, 
JSONTypeContext *tc, size_t *_outLen); + Object_getLongValue, //JSLONG (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + Object_getIntValue, //JSLONG (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + Object_getDoubleValue, //double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + Object_iterBegin, //JSPFN_ITERBEGIN iterBegin; + Object_iterNext, //JSPFN_ITERNEXT iterNext; + Object_iterEnd, //JSPFN_ITEREND iterEnd; + Object_iterGetValue, //JSPFN_ITERGETVALUE iterGetValue; + Object_iterGetName, //JSPFN_ITERGETNAME iterGetName; + Object_releaseObject, //void (*releaseValue)(JSONTypeContext *ti); + PyObject_Malloc, //JSPFN_MALLOC malloc; + PyObject_Realloc, //JSPFN_REALLOC realloc; + PyObject_Free, //JSPFN_FREE free; + -1, //recursionMax + idoublePrecision, + 1, //forceAscii + } + }; + JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; + + pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.outputFormat = COLUMNS; + + PRINTMARK(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Ois", kwlist, &oinput, &oensureAscii, &idoublePrecision, &sOrient)) + { + return NULL; + } + + if (sOrient != NULL) + { + if (strcmp(sOrient, "records") == 0) + { + pyEncoder.outputFormat = RECORDS; + } + else + if (strcmp(sOrient, "index") == 0) + { + pyEncoder.outputFormat = INDEX; + } + else + if (strcmp(sOrient, "split") == 0) + { + pyEncoder.outputFormat = SPLIT; + } + else + if (strcmp(sOrient, "values") == 0) + { + pyEncoder.outputFormat = VALUES; + } + else + if (strcmp(sOrient, "columns") != 0) + { + PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'orient'", sOrient); + return NULL; + } + } + + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; + + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) + { + encoder->forceASCII = 0; + } + + encoder->doublePrecision = idoublePrecision; + + PRINTMARK(); + ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer)); + PRINTMARK(); + + if (PyErr_Occurred()) + { + return NULL; + } + + if (encoder->errorMsg) + { + if (ret != buffer) + { + encoder->free (ret); + } + + PyErr_Format (PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } + + newobj = PyString_FromString (ret); + + if (ret != buffer) + { + encoder->free (ret); + } + + PRINTMARK(); + + return newobj; +} + +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *data; + PyObject *file; + PyObject *string; + PyObject *write; + PyObject *argtuple; + + PRINTMARK(); + + if (!PyArg_ParseTuple (args, "OO", &data, &file)) { + return NULL; + } + + if (!PyObject_HasAttrString (file, "write")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + write = PyObject_GetAttrString (file, "write"); + + if (!PyCallable_Check (write)) { + Py_XDECREF(write); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + argtuple = PyTuple_Pack(1, data); + + string = objToJSON (self, argtuple, kwargs); + + if (string == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(argtuple); + + argtuple = PyTuple_Pack (1, string); + if (argtuple == NULL) + { + Py_XDECREF(write); + return NULL; + } + if (PyObject_CallObject (write, argtuple) == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(write); + Py_DECREF(argtuple); + Py_XDECREF(string); + + PRINTMARK(); + + Py_RETURN_NONE; + + +} + diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c new file mode 100644 index 0000000000000..e04309e620a1d --- 
/dev/null
+++ b/pandas/src/ujson/python/ujson.c
@@ -0,0 +1,73 @@
+#include "py_defines.h"
+#include "version.h"
+
+/* objToJSON */
+PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs);
+void initObjToJSON(void);
+
+/* JSONToObj */
+PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs);
+
+/* objToJSONFile */
+PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs);
+
+/* JSONFileToObj */
+PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs);
+
+
+static PyMethodDef ujsonMethods[] = {
+  {"encode", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON. Use ensure_ascii=false to output UTF-8. Pass in double_precision to alter the maximum digit precision with doubles"},
+  {"decode", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure"},
+  {"dumps", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON. Use ensure_ascii=false to output UTF-8"},
+  {"loads", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure"},
+  {"dump", (PyCFunction) objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"},
+  {"load", (PyCFunction) JSONFileToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as file to dict object structure"},
+  {NULL, NULL, 0, NULL}  /* Sentinel */
+};
+
+#if PY_MAJOR_VERSION >= 3
+
+static struct PyModuleDef moduledef = {
+  PyModuleDef_HEAD_INIT,
+  "_pandasujson",
+  0,            /* m_doc */
+  -1,           /* m_size */
+  ujsonMethods, /* m_methods */
+  NULL,         /* m_reload */
+  NULL,         /* m_traverse */
+  NULL,         /* m_clear */
+  NULL          /* m_free */
+};
+
+#define PYMODINITFUNC       PyObject *PyInit_json(void)
+#define PYMODULE_CREATE()   PyModule_Create(&moduledef)
+#define MODINITERROR        return NULL
+
+#else
+
+#define PYMODINITFUNC       PyMODINIT_FUNC initjson(void)
+#define PYMODULE_CREATE()   Py_InitModule("json", ujsonMethods)
+#define MODINITERROR        return
+
+#endif
+
+PYMODINITFUNC
+{
+  PyObject *module;
+  PyObject *version_string;
+
+  initObjToJSON();
+  module = PYMODULE_CREATE();
+
+  if (module == NULL)
+  {
+    MODINITERROR;
+  }
+
+  version_string = PyString_FromString (UJSON_VERSION);
+  PyModule_AddObject (module, "__version__", version_string);
+
+#if PY_MAJOR_VERSION >= 3
+  return module;
+#endif
+}
diff --git a/setup.py b/setup.py
index 1cc666c87404b..ff40738ddfb78 100755
--- a/setup.py
+++ b/setup.py
@@ -244,7 +244,13 @@ def initialize_options(self):
                    'np_datetime_strings.c',
                    'period.c',
                    'tokenizer.c',
-                   'io.c']
+                   'io.c',
+                   'ujson.c',
+                   'objToJSON.c',
+                   'JSONtoObj.c',
+                   'ultrajsonenc.c',
+                   'ultrajsondec.c',
+                   ]
 
 for root, dirs, files in list(os.walk('pandas')):
     for f in files:
@@ -472,7 +478,8 @@ def pxd(name):
                            'pandas/src/datetime/np_datetime.c',
                            'pandas/src/datetime/np_datetime_strings.c'],
                   include_dirs=['pandas/src/ujson/python',
-                                'pandas/src/ujson/lib'] + common_include)
+                                'pandas/src/ujson/lib',
+                                'pandas/src/datetime'] + common_include)
 
 extensions.append(ujson_ext)

From 7dd12cce711ffc478b69ebee2e8fa013d34ba746 Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 7 Jun 2013 18:42:56 -0400
Subject: [PATCH 05/10] CLN: revised json support to use the to_json/read_json
 in pandas.io.json

DOC: docs in io.rst/whatsnew/release notes/api

TST: cleaned up cruft in test_series/test_frame
---
 doc/source/api.rst                       |  11 ++
 doc/source/io.rst                        |  27 +++-
 doc/source/v0.11.1.txt                   |   6 +
 pandas/core/frame.py                     | 100 ---------------
 pandas/core/generic.py                   |  32 +++++
 pandas/core/series.py                    |  82 ------------
 pandas/io/api.py                         |   1 +
 pandas/io/json.py                        | 152 +++++++++++++++++++++++
 pandas/io/tests/test_json/test_pandas.py |  42 +++++--
 pandas/tests/test_frame.py               | 140 ---------------------
 pandas/tests/test_series.py              |  56 ---------
 11 files changed, 257 insertions(+), 392 deletions(-)
 create mode 100644 pandas/io/json.py

diff --git a/doc/source/api.rst b/doc/source/api.rst
index e263554460380..bb6f0ac073e21 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -45,6 +45,16 @@ Excel
    read_excel
    ExcelFile.parse
 
+JSON
+~~~~
+
+.. currentmodule:: pandas.io.json
+
+.. autosummary::
+   :toctree: generated/
+
+   read_json
+
 HTML
 ~~~~
 
@@ -597,6 +607,7 @@ Serialization / IO / Conversion
    DataFrame.to_hdf
    DataFrame.to_dict
    DataFrame.to_excel
+   DataFrame.to_json
    DataFrame.to_html
    DataFrame.to_stata
    DataFrame.to_records
diff --git a/doc/source/io.rst b/doc/source/io.rst
index ac5d49e036669..625ff39cd7eba 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -35,6 +35,7 @@ object.
     * ``read_excel``
     * ``read_hdf``
     * ``read_sql``
+    * ``read_json``
     * ``read_html``
     * ``read_stata``
     * ``read_clipboard``
@@ -45,6 +46,7 @@ The corresponding ``writer`` functions are object methods that are accessed like
     * ``to_excel``
     * ``to_hdf``
     * ``to_sql``
+    * ``to_json``
     * ``to_html``
     * ``to_stata``
     * ``to_clipboard``
@@ -937,6 +939,30 @@ The Series object also has a ``to_string`` method, but with only the ``buf``,
 which, if set to ``True``, will additionally output the length of the Series.
 
+
+JSON
+----
+
+Read and write ``JSON`` format files.
+
+.. _io.json:
+
+Writing JSON
+~~~~~~~~~~~~
+
+.. ipython:: python
+
+   df = DataFrame(randn(10, 2), columns=list('AB'))
+   s = df.to_json()
+   s
+
+Reading JSON
+~~~~~~~~~~~~
+
+.. ipython:: python
+
+   pd.read_json(s)
+
 HTML
 ----
 
@@ -2193,7 +2219,6 @@ into a .dta file. The format version of this file is always the latest one, 115.
 
 .. ipython:: python
 
-   from pandas.io.stata import StataWriter
    df = DataFrame(randn(10, 2), columns=list('AB'))
    df.to_stata('stata.dta')
 
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
index 70d840f8c477a..5045f73375a97 100644
--- a/doc/source/v0.11.1.txt
+++ b/doc/source/v0.11.1.txt
@@ -16,6 +16,7 @@ API changes
     * ``read_excel``
     * ``read_hdf``
     * ``read_sql``
+    * ``read_json``
     * ``read_html``
     * ``read_stata``
     * ``read_clipboard``
@@ -26,6 +27,7 @@ API changes
     * ``to_excel``
     * ``to_hdf``
     * ``to_sql``
+    * ``to_json``
     * ``to_html``
     * ``to_stata``
     * ``to_clipboard``
@@ -175,6 +177,10 @@ Enhancements
     accessable via ``read_stata`` top-level function for reading,
     and ``to_stata`` DataFrame method for writing, :ref:`See the docs<io.stata>`
 
+  - Added module for reading and writing json format files: ``pandas.io.json``
+    accessible via ``read_json`` top-level function for reading,
+    and ``to_json`` DataFrame method for writing, :ref:`See the docs<io.json>`
+
   - ``DataFrame.replace()`` now allows regular expressions on contained
     ``Series`` with object dtype. See the examples section in the regular docs
     :ref:`Replacing via String Expression `
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2925bb3e3b73a..9c0a2843370f4 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5593,106 +5593,6 @@ def mask(self, cond):
         """
         return self.where(~cond, NA)
 
-
-@classmethod
-def from_json(cls, json, orient="columns", dtype=None, numpy=True):
-    """
-    Convert JSON string to DataFrame
-
-    Parameters
-    ----------
-    json : The JSON string to parse.
- orient : {'split', 'records', 'index', 'columns', 'values'}, - default 'columns' - The format of the JSON string - split : dict like - {index -> [index], columns -> [columns], data -> [values]} - records : list like [{column -> value}, ... , {column -> value}] - index : dict like {index -> {column -> value}} - columns : dict like {column -> {index -> value}} - values : just the values array - dtype : dtype of the resulting DataFrame - nupmpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. - - Returns - ------- - result : DataFrame - """ - from pandas.json import loads - - df = None - - if dtype is not None and orient == "split": - numpy = False - - if numpy: - try: - if orient == "columns": - args = loads(json, dtype=dtype, numpy=True, labelled=True) - if args: - args = (args[0].T, args[2], args[1]) - df = DataFrame(*args) - elif orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - df = DataFrame(**decoded) - elif orient == "values": - df = DataFrame(loads(json, dtype=dtype, numpy=True)) - else: - df = DataFrame(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - except ValueError: - numpy = False - if not numpy: - if orient == "columns": - df = DataFrame(loads(json), dtype=dtype) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - df = DataFrame(dtype=dtype, **decoded) - elif orient == "index": - df = DataFrame(loads(json), dtype=dtype).T - else: - df = DataFrame(loads(json), dtype=dtype) - - return df -DataFrame.from_json = from_json - - -def to_json(self, orient="columns", double_precision=10, - force_ascii=True): - """ - Convert DataFrame to a JSON string. - - Note NaN's and None will be converted to null and datetime objects - will be converted to UNIX timestamps. - - Parameters - ---------- - orient : {'split', 'records', 'index', 'columns', 'values'}, - default 'columns' - The format of the JSON string - split : dict like - {index -> [index], columns -> [columns], data -> [values]} - records : list like [{column -> value}, ... , {column -> value}] - index : dict like {index -> {column -> value}} - columns : dict like {column -> {index -> value}} - values : just the values array - double_precision : The number of decimal places to use when encoding - floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. - - Returns - ------- - result : JSON compatible string - """ - from pandas.json import dumps - return dumps(self, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) -DataFrame.to_json = to_json - - _EMPTY_SERIES = Series([]) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5533584745167..7a947f9b4f96b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,6 +495,38 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) + def to_json(self, orient=None, double_precision=10, + force_ascii=True): + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index', 'columns', 'values'}, + default is 'index' for Series, 'columns' for DataFrame + + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... 
, {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + from pandas.io import json + return json.to_json(self, orient=orient, double_precision=double_precision, + force_ascii=force_ascii) + # install the indexerse for _name, _indexer in indexing.get_indexers_list(): PandasObject._create_indexer(_name,_indexer) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9147e64f5b11a..3a7a7d0f49b66 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3298,88 +3298,6 @@ def str(self): from pandas.core.strings import StringMethods return StringMethods(self) - -@classmethod -def from_json(cls, json, orient="index", dtype=None, numpy=True): - """ - Convert JSON string to Series - - Parameters - ---------- - json : The JSON string to parse. - orient : {'split', 'records', 'index'}, default 'index' - The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} - dtype : dtype of the resulting Series - nupmpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. - - Returns - ------- - result : Series - """ - from pandas.json import loads - s = None - - if dtype is not None and orient == "split": - numpy = False - - if numpy: - try: - if orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - s = Series(**decoded) - elif orient == "columns" or orient == "index": - s = Series(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - else: - s = Series(loads(json, dtype=dtype, numpy=True)) - except ValueError: - numpy = False - if not numpy: - if orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - s = Series(dtype=dtype, **decoded) - else: - s = Series(loads(json), dtype=dtype) - - return s -Series.from_json = from_json - -def to_json(self, orient="index", double_precision=10, force_ascii=True): - """ - Convert Series to a JSON string - - Note NaN's and None will be converted to null and datetime objects - will be converted to UNIX timestamps. - - Parameters - ---------- - orient : {'split', 'records', 'index'}, default 'index' - The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} - double_precision : The number of decimal places to use when encoding - floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. 
- - Returns - ------- - result : JSON compatible string - """ - from pandas.json import dumps - return dumps(self, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) -Series.to_json = to_json - - _INDEX_TYPES = ndarray, Index, list, tuple #------------------------------------------------------------------------------ diff --git a/pandas/io/api.py b/pandas/io/api.py index f17351921f83f..48566399f9bfe 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -6,6 +6,7 @@ from pandas.io.clipboard import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel from pandas.io.pytables import HDFStore, Term, get_store, read_hdf +from pandas.io.json import read_json from pandas.io.html import read_html from pandas.io.sql import read_sql from pandas.io.stata import read_stata diff --git a/pandas/io/json.py b/pandas/io/json.py new file mode 100644 index 0000000000000..7c8f6f40bfd4e --- /dev/null +++ b/pandas/io/json.py @@ -0,0 +1,152 @@ + +# pylint: disable-msg=E1101,W0613,W0603 +from pandas import Series, DataFrame + +import pandas.json as _json +loads = _json.loads +dumps = _json.dumps + +### interface to/from ### + +def to_json(obj, orient=None, double_precision=10, + force_ascii=True): + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index', 'columns', 'values'}, + default is 'index' for Series, 'columns' for DataFrame + + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + if orient is None: + if isinstance(obj, Series): + orient = 'index' + elif isinstance(obj, DataFrame): + orient = 'columns' + + return dumps(obj, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) + +def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): + """ + Convert JSON string to pandas object + + Parameters + ---------- + json : The JSON string to parse. + typ : type of object to recover (series or frame), default 'frame' + orient : {'split', 'records', 'index'}, default 'index' + The format of the JSON string + split : dict like + {index -> [index], name -> name, data -> [values]} + records : list like [value, ... , value] + index : dict like {index -> value} + dtype : dtype of the resulting Series + nupmpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. 
+ + Returns + ------- + result : Series or DataFrame + """ + + obj = None + if typ == 'frame': + if orient is None: + orient = 'columns' + obj = load_frame(json, orient, dtype, numpy) + + if typ == 'series' or obj is None: + if orient == 'columns': + orient = 'index' + obj = load_series(json, orient, dtype, numpy) + + return obj + +def load_series(json, orient, dtype, numpy): + s = None + + if dtype is not None and orient == "split": + numpy = False + + if numpy: + try: + if orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + s = Series(**decoded) + elif orient == "columns" or orient == "index": + s = Series(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + else: + s = Series(loads(json, dtype=dtype, numpy=True)) + except ValueError: + numpy = False + + if not numpy: + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + s = Series(dtype=dtype, **decoded) + else: + s = Series(loads(json), dtype=dtype) + + return s + + +def load_frame(json, orient, dtype, numpy): + """ try to recover a frame, return None if we didn't get anything """ + + if dtype is not None and orient == "split": + numpy = False + + if numpy: + try: + if orient == "columns": + args = loads(json, dtype=dtype, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + df = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + df = DataFrame(**decoded) + elif orient == "values": + df = DataFrame(loads(json, dtype=dtype, numpy=True)) + else: + df = DataFrame(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + except ValueError: + numpy = False + + if not numpy: + if orient == "columns": + df = DataFrame(loads(json), dtype=dtype) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + df = DataFrame(dtype=dtype, **decoded) + elif orient == "index": + df = DataFrame(loads(json), dtype=dtype).T + else: + df = DataFrame(loads(json), dtype=dtype) + + return df diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 506aa382487d6..f4cb7ed03c026 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -1,3 +1,4 @@ + # pylint: disable-msg=W0612,E1101 from copy import deepcopy from datetime import datetime, timedelta @@ -11,6 +12,7 @@ from pandas import Series, DataFrame, DatetimeIndex import pandas as pd +read_json = pd.read_json from pandas.util.testing import (assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -55,8 +57,8 @@ def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=True): df = df.sort() dfjson = df.to_json(orient=orient) - unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy) + unser = read_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy) unser = unser.sort() if df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) @@ -136,50 +138,50 @@ def _check_all_orients(df, dtype=None): _check_orient(df.transpose().transpose(), "index") def test_frame_from_json_bad_data(self): - self.assertRaises(ValueError, DataFrame.from_json, '{"key":b:a:d}') + self.assertRaises(ValueError, read_json, '{"key":b:a:d}') # too few indices json = ('{"columns":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') - 
self.assertRaises(ValueError, DataFrame.from_json, json, + self.assertRaises(ValueError, read_json, json, orient="split") # too many columns json = ('{"columns":["A","B","C"],' '"index":["1","2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') - self.assertRaises(AssertionError, DataFrame.from_json, json, + self.assertRaises(AssertionError, read_json, json, orient="split") # bad key json = ('{"badkey":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') - self.assertRaises(TypeError, DataFrame.from_json, json, + self.assertRaises(TypeError, read_json, json, orient="split") def test_frame_from_json_nones(self): df = DataFrame([[1, 2], [4, 5, 6]]) - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(np.isnan(unser['2'][0])) df = DataFrame([['1', '2'], ['4', '5', '6']]) - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(unser['2'][0] is None) - unser = DataFrame.from_json(df.to_json(), numpy=False) + unser = read_json(df.to_json(), numpy=False) self.assert_(unser['2'][0] is None) # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df[2][0] = np.inf - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(np.isnan(unser['2'][0])) df[2][0] = np.NINF - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(np.isnan(unser['2'][0])) def test_frame_to_json_except(self): @@ -190,8 +192,8 @@ def test_series_from_json_to_json(self): def _check_orient(series, orient, dtype=None, numpy=True): series = series.sort_index() - unser = Series.from_json(series.to_json(orient=orient), - orient=orient, numpy=numpy, dtype=dtype) + unser = read_json(series.to_json(orient=orient), typ='series', + orient=orient, numpy=numpy, dtype=dtype) unser = unser.sort_index() if series.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) @@ -238,3 +240,17 @@ def _check_all_orients(series, dtype=None): def test_series_to_json_except(self): s = Series([1, 2, 3]) self.assertRaises(ValueError, s.to_json, orient="garbage") + + def test_typ(self): + + s = Series(range(6), index=['a','b','c','d','e','f']) + result = read_json(s.to_json(),typ=None) + assert_series_equal(result,s) + + def test_reconstruction_index(self): + + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + result = read_json(df.to_json()) + + # the index is serialized as strings....correct? 
+ #assert_frame_equal(result,df) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d674a2f44ebe1..2c6d3b221c6ff 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3338,146 +3338,6 @@ def test_to_dict(self): for k2, v2 in v.iteritems(): self.assertEqual(v2, recons_data[k][k2]) - def test_from_json_to_json(self): - raise nose.SkipTest - - def _check_orient(df, orient, dtype=None, numpy=True): - df = df.sort() - dfjson = df.to_json(orient=orient) - unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy) - unser = unser.sort() - if df.index.dtype.type == np.datetime64: - unser.index = DatetimeIndex(unser.index.values.astype('i8')) - if orient == "records": - # index is not captured in this orientation - assert_almost_equal(df.values, unser.values) - self.assert_(df.columns.equals(unser.columns)) - elif orient == "values": - # index and cols are not captured in this orientation - assert_almost_equal(df.values, unser.values) - elif orient == "split": - # index and col labels might not be strings - unser.index = [str(i) for i in unser.index] - unser.columns = [str(i) for i in unser.columns] - unser = unser.sort() - assert_almost_equal(df.values, unser.values) - else: - assert_frame_equal(df, unser) - - def _check_all_orients(df, dtype=None): - _check_orient(df, "columns", dtype=dtype) - _check_orient(df, "records", dtype=dtype) - _check_orient(df, "split", dtype=dtype) - _check_orient(df, "index", dtype=dtype) - _check_orient(df, "values", dtype=dtype) - - _check_orient(df, "columns", dtype=dtype, numpy=False) - _check_orient(df, "records", dtype=dtype, numpy=False) - _check_orient(df, "split", dtype=dtype, numpy=False) - _check_orient(df, "index", dtype=dtype, numpy=False) - _check_orient(df, "values", dtype=dtype, numpy=False) - - # basic - _check_all_orients(self.frame) - self.assertEqual(self.frame.to_json(), - self.frame.to_json(orient="columns")) - - _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) - - # big one - # index and columns are strings as all unserialised JSON object keys - # are assumed to be strings - biggie = DataFrame(np.zeros((200, 4)), - columns=[str(i) for i in range(4)], - index=[str(i) for i in range(200)]) - _check_all_orients(biggie) - - # dtypes - _check_all_orients(DataFrame(biggie, dtype=np.float64), - dtype=np.float64) - _check_all_orients(DataFrame(biggie, dtype=np.int64), dtype=np.int64) - _check_all_orients(DataFrame(biggie, dtype=' Date: Fri, 7 Jun 2013 21:55:35 -0400 Subject: [PATCH 06/10] DOC: io.rst doc updates --- doc/source/io.rst | 34 ++++++++++++++++++++++++++++++++++ pandas/io/json.py | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 625ff39cd7eba..c98b49be9827f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -949,6 +949,22 @@ Read and write ``JSON`` format files. Writing JSON ~~~~~~~~~~~~ + +A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json`` +with optional parameters: + +- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` + + * split : dict like {index -> [index], columns -> [columns], data -> [values]} + * records : list like [{column -> value}, ... 
, {column -> value}] + * index : dict like {index -> {column -> value}} + * columns : dict like {column -> {index -> value}} + * values : just the values array + +- double_precision : The number of decimal places to use when encoding floating point values, default 10. +- force_ascii : force encoded string to be ASCII, default True. + +Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. .. ipython:: python @@ -959,6 +975,24 @@ Writing JSON Reading JSON ~~~~~~~~~~~~ +Reading a JSON string to pandas object can take a number of parameters. +The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or +is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` + +- json : The JSON string to parse. +- typ : type of object to recover (series or frame), default 'frame' +- orient : The format of the JSON string, one of the following + + * split : dict like {index -> [index], name -> name, data -> [values]} + * records : list like [value, ... , value] + * index : dict like {index -> value} + +- dtype : dtype of the resulting Series +- numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. + +The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is +not parsable. + .. ipython:: python pd.read_json(s) diff --git a/pandas/io/json.py b/pandas/io/json.py index 7c8f6f40bfd4e..76d8ae05b07c0 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -36,6 +36,7 @@ def to_json(obj, orient=None, double_precision=10, ------- result : JSON compatible string """ + if orient is None: if isinstance(obj, Series): orient = 'index' @@ -60,7 +61,7 @@ def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): records : list like [value, ... , value] index : dict like {index -> value} dtype : dtype of the resulting Series - nupmpy: direct decoding to numpy arrays. default True but falls back + numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. Returns From 64220419afd6b711eecf24c9acf300b5d1dd5110 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Jun 2013 09:02:12 -0400 Subject: [PATCH 07/10] API: to_json now writes to a file by default (if None is provided it will return a StringIO object) read_json will read from a string-like or filebuf or url (consistent with other parsers) --- doc/source/io.rst | 23 ++++++-- pandas/core/generic.py | 11 +++- pandas/io/common.py | 1 + pandas/io/excel.py | 2 +- pandas/io/json.py | 41 +++++++++++--- pandas/io/parsers.py | 1 - pandas/io/tests/test_json/test_pandas.py | 70 +++++++++++++++--------- pandas/io/tests/test_json/test_ujson.py | 30 +++++----- 8 files changed, 122 insertions(+), 57 deletions(-) mode change 100644 => 100755 pandas/io/tests/test_json/test_pandas.py diff --git a/doc/source/io.rst b/doc/source/io.rst index c98b49be9827f..f1480b6546e04 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -939,7 +939,6 @@ The Series object also has a ``to_string`` method, but with only the ``buf``, which, if set to ``True``, will additionally output the length of the Series. - JSON ---- @@ -953,6 +952,8 @@ Writing JSON A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. 
Use ``to_json`` with optional parameters: +- path_or_buf : the pathname or buffer to write the output + This can be ``None`` in which case a ``StringIO`` converted string is returned - orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` * split : dict like {index -> [index], columns -> [columns], data -> [values]} @@ -969,8 +970,8 @@ Note NaN's and None will be converted to null and datetime objects will be conve .. ipython:: python df = DataFrame(randn(10, 2), columns=list('AB')) - s = df.to_json() - s + json = df.to_json(None) + json.getvalue() Reading JSON ~~~~~~~~~~~~ @@ -979,7 +980,11 @@ Reading a JSON string to pandas object can take a number of parameters. The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` -- json : The JSON string to parse. +- filepath_or_buffer : a **VALID** JSON string or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host + is expected. For instance, a local file could be + file ://localhost/path/to/table.json +- json : a VALID JSON string, optional, used if filepath_or_buffer is not provided - typ : type of object to recover (series or frame), default 'frame' - orient : The format of the JSON string, one of the following @@ -993,9 +998,17 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. +Reading from a JSON string + +.. ipython:: python + + pd.read_json(json='{"0":{"0":1,"1":3},"1":{"0":2,"1":4}}') + +Reading from a StringIO + .. ipython:: python - pd.read_json(s) + pd.read_json(json) HTML ---- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7a947f9b4f96b..ac9663a34e748 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,7 +495,7 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) - def to_json(self, orient=None, double_precision=10, + def to_json(self, path_or_buf, orient=None, double_precision=10, force_ascii=True): """ Convert the object to a JSON string. 
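A minimal sketch of the behaviour this hunk introduces (the target name ``df.json`` is illustrative; under this patch a ``None`` target returns a ``StringIO``)::

    # Python 2, as used throughout this series
    from numpy.random import randn
    from pandas import DataFrame

    df = DataFrame(randn(5, 2), columns=list('AB'))

    buf = df.to_json(None)   # no target given: returns a StringIO of the encoded JSON
    text = buf.getvalue()    # the raw JSON string

    df.to_json('df.json')    # a string target is treated as a path and written to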
@@ -505,6 +505,8 @@ def to_json(self, orient=None, double_precision=10, Parameters ---------- + path_or_buf : the path or buffer to write the result string + if this is None, return a StringIO of the converted string orient : {'split', 'records', 'index', 'columns', 'values'}, default is 'index' for Series, 'columns' for DataFrame @@ -521,10 +523,13 @@ def to_json(self, orient=None, double_precision=10, Returns ------- - result : JSON compatible string + result : a JSON compatible string written to the path_or_buf; + if the path_or_buf is none, return a StringIO of the result + """ + from pandas.io import json - return json.to_json(self, orient=orient, double_precision=double_precision, + return json.to_json(path_or_buf, self, orient=orient, double_precision=double_precision, force_ascii=force_ascii) # install the indexerse diff --git a/pandas/io/common.py b/pandas/io/common.py index 46b47c06f7f5d..353930482c8b8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -2,6 +2,7 @@ import urlparse from pandas.util import py3compat +from StringIO import StringIO _VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc + urlparse.uses_params) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5b7d13acd99ec..95702847d9c7f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -11,7 +11,7 @@ from pandas.io.parsers import TextParser from pandas.tseries.period import Period -import json +from pandas import json def read_excel(path_or_buf, sheetname, kind=None, **kwds): """Read an Excel table into a pandas DataFrame diff --git a/pandas/io/json.py b/pandas/io/json.py index 76d8ae05b07c0..48412f21fbbdd 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -1,6 +1,8 @@ # pylint: disable-msg=E1101,W0613,W0603 from pandas import Series, DataFrame +from pandas.io.common import get_filepath_or_buffer +from StringIO import StringIO import pandas.json as _json loads = _json.loads @@ -8,16 +10,18 @@ ### interface to/from ### -def to_json(obj, orient=None, double_precision=10, +def to_json(path_or_buf, obj, orient=None, double_precision=10, force_ascii=True): """ - Convert the object to a JSON string. + Convert the object to a JSON string Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. 
Parameters ---------- + path_or_buf : the pathname or buffer to write the output + if this is None, return a StringIO of the converted string orient : {'split', 'records', 'index', 'columns', 'values'}, default is 'index' for Series, 'columns' for DataFrame @@ -34,7 +38,9 @@ def to_json(obj, orient=None, double_precision=10, Returns ------- - result : JSON compatible string + result : a JSON compatible string written to the path_or_buf; + if the path_or_buf is none, return a StringIO of the result + """ if orient is None: @@ -43,16 +49,27 @@ def to_json(obj, orient=None, double_precision=10, elif isinstance(obj, DataFrame): orient = 'columns' - return dumps(obj, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) + s = dumps(obj, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) + if isinstance(path_or_buf, basestring): + with open(path_or_buf,'w') as fh: + fh.write(s) + elif path_or_buf is None: + return StringIO(s) + else: + path_or_buf.write(s) -def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): +def read_json(filepath_or_buffer=None, json=None, typ='frame', orient=None, dtype=None, numpy=True): """ Convert JSON string to pandas object Parameters ---------- - json : The JSON string to parse. + filepath_or_buffer : a VALID JSON StringIO or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host + is expected. For instance, a local file could be + file ://localhost/path/to/table.json + json : a VALID JSON string, optional, used if filepath_or_buffer is not provided typ : type of object to recover (series or frame), default 'frame' orient : {'split', 'records', 'index'}, default 'index' The format of the JSON string @@ -69,6 +86,16 @@ def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): result : Series or DataFrame """ + if json is None: + filepath_or_buffer,_ = get_filepath_or_buffer(filepath_or_buffer) + if isinstance(filepath_or_buffer, basestring): + with open(filepath_or_buffer,'r') as fh: + json = fh.read() + elif hasattr(filepath_or_buffer, 'read'): + json = filepath_or_buffer.read() + else: + json = filepath_or_buffer + obj = None if typ == 'frame': if orient is None: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6e937ba696e39..faf439d87a5f2 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -23,7 +23,6 @@ import pandas.tslib as tslib import pandas.parser as _parser from pandas.tseries.period import Period -import json class DateConversionError(Exception): diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py old mode 100644 new mode 100755 index f4cb7ed03c026..e9bb358763fd8 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -15,8 +15,9 @@ read_json = pd.read_json from pandas.util.testing import (assert_almost_equal, assert_frame_equal, - assert_series_equal) + assert_series_equal, network) import pandas.util.testing as tm +from numpy.testing.decorators import slow _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -56,7 +57,7 @@ def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=True): df = df.sort() - dfjson = df.to_json(orient=orient) + dfjson = df.to_json(None, orient=orient) unser = read_json(dfjson, orient=orient, dtype=dtype, numpy=numpy) unser = unser.sort() @@ -93,8 +94,8 @@ def _check_all_orients(df, dtype=None): # basic 
_check_all_orients(self.frame) - self.assertEqual(self.frame.to_json(), - self.frame.to_json(orient="columns")) + self.assertEqual(self.frame.to_json(None).read(), + self.frame.to_json(None,orient="columns").read()) _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) @@ -138,61 +139,61 @@ def _check_all_orients(df, dtype=None): _check_orient(df.transpose().transpose(), "index") def test_frame_from_json_bad_data(self): - self.assertRaises(ValueError, read_json, '{"key":b:a:d}') + self.assertRaises(ValueError, read_json, StringIO('{"key":b:a:d}')) # too few indices - json = ('{"columns":["A","B"],' - '"index":["2","3"],' - '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') + json = StringIO('{"columns":["A","B"],' + '"index":["2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') self.assertRaises(ValueError, read_json, json, orient="split") # too many columns - json = ('{"columns":["A","B","C"],' - '"index":["1","2","3"],' - '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') + json = StringIO('{"columns":["A","B","C"],' + '"index":["1","2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') self.assertRaises(AssertionError, read_json, json, orient="split") # bad key - json = ('{"badkey":["A","B"],' - '"index":["2","3"],' - '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') + json = StringIO('{"badkey":["A","B"],' + '"index":["2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') self.assertRaises(TypeError, read_json, json, orient="split") def test_frame_from_json_nones(self): df = DataFrame([[1, 2], [4, 5, 6]]) - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(np.isnan(unser['2'][0])) df = DataFrame([['1', '2'], ['4', '5', '6']]) - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(unser['2'][0] is None) - unser = read_json(df.to_json(), numpy=False) + unser = read_json(df.to_json(None), numpy=False) self.assert_(unser['2'][0] is None) # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df[2][0] = np.inf - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(np.isnan(unser['2'][0])) df[2][0] = np.NINF - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(np.isnan(unser['2'][0])) def test_frame_to_json_except(self): df = DataFrame([1, 2, 3]) - self.assertRaises(ValueError, df.to_json, orient="garbage") + self.assertRaises(ValueError, df.to_json, None, orient="garbage") def test_series_from_json_to_json(self): def _check_orient(series, orient, dtype=None, numpy=True): series = series.sort_index() - unser = read_json(series.to_json(orient=orient), typ='series', + unser = read_json(series.to_json(None,orient=orient), typ='series', orient=orient, numpy=numpy, dtype=dtype) unser = unser.sort_index() if series.index.dtype.type == np.datetime64: @@ -222,8 +223,8 @@ def _check_all_orients(series, dtype=None): # basic _check_all_orients(self.series) - self.assertEqual(self.series.to_json(), - self.series.to_json(orient="index")) + self.assertEqual(self.series.to_json(None).read(), + self.series.to_json(None,orient="index").read()) objSeries = Series([str(d) for d in self.objSeries], index=self.objSeries.index, @@ -239,18 +240,35 @@ def _check_all_orients(series, dtype=None): def test_series_to_json_except(self): s = Series([1, 2, 3]) - self.assertRaises(ValueError, s.to_json, orient="garbage") + self.assertRaises(ValueError, s.to_json, None, orient="garbage") def test_typ(self): s = Series(range(6), 
index=['a','b','c','d','e','f'])
-        result = read_json(s.to_json(),typ=None)
+        result = read_json(s.to_json(None),typ=None)
         assert_series_equal(result,s)

     def test_reconstruction_index(self):

         df = DataFrame([[1, 2, 3], [4, 5, 6]])
-        result = read_json(df.to_json())
+        result = read_json(df.to_json(None))

         # the index is serialized as strings....correct?
         #assert_frame_equal(result,df)
+
+    @network
+    @slow
+    def test_url(self):
+        import urllib2
+        try:
+            # HTTP(S)
+            url = 'https://api.github.com/repos/pydata/pandas/issues?per_page=5'
+            result = read_json(url)
+            #print result
+
+            url = 'http://search.twitter.com/search.json?q=pandas%20python'
+            result = read_json(url)
+            #print result
+
+        except urllib2.URLError:
+            raise nose.SkipTest
diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py
index 833abcb32fa98..2e775b4a541ea 100644
--- a/pandas/io/tests/test_json/test_ujson.py
+++ b/pandas/io/tests/test_json/test_ujson.py
@@ -955,20 +955,22 @@ def testArrayNumpyLabelled(self):
         self.assertTrue(output[1] is None)
         self.assertTrue((np.array([u'a']) == output[2]).all())

-        input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}]
-        output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
-        expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2))
-        self.assertTrue((expectedvals == output[0]).all())
-        self.assertTrue(output[1] is None)
-        self.assertTrue((np.array([u'a', 'b']) == output[2]).all())
-
-
-        input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}}
-        output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
-        expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2))
-        self.assertTrue((expectedvals == output[0]).all())
-        self.assertTrue((np.array(['1','2','3']) == output[1]).all())
-        self.assertTrue((np.array(['a', 'b']) == output[2]).all())
+        # py3 is non-deterministic on the ordering......
+ if not py3compat.PY3: + input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a', 'b']) == output[2]).all()) + + + input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue((np.array(['1','2','3']) == output[1]).all()) + self.assertTrue((np.array(['a', 'b']) == output[2]).all()) class PandasJSONTests(TestCase): From 8e673cf0766b697952522aa593e53bb80b1fbce2 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Jun 2013 01:19:52 -0400 Subject: [PATCH 08/10] ENH: removed json argument, now path_or_buf can be a path,buffer,url,or JSON string added keywords parse_dates,keep_default_dates to allow for date parsing in columns of a Frame (default is False, not to parse dates) --- doc/source/io.rst | 35 ++- pandas/core/generic.py | 4 +- pandas/io/json.py | 295 ++++++++++++++--------- pandas/io/tests/test_json/test_pandas.py | 71 ++++-- 4 files changed, 263 insertions(+), 142 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f1480b6546e04..ee234bc352090 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -953,7 +953,7 @@ A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_ with optional parameters: - path_or_buf : the pathname or buffer to write the output - This can be ``None`` in which case a ``StringIO`` converted string is returned + This can be ``None`` in which case a JSON string is returned - orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` * split : dict like {index -> [index], columns -> [columns], data -> [values]} @@ -969,9 +969,19 @@ Note NaN's and None will be converted to null and datetime objects will be conve .. ipython:: python - df = DataFrame(randn(10, 2), columns=list('AB')) - json = df.to_json(None) - json.getvalue() + dfj = DataFrame(randn(5, 2), columns=list('AB')) + json = dfj.to_json() + json + +Writing to a file, with a date index and a date column + +.. ipython:: python + + dfj2 = dfj.copy() + dfj2['date'] = Timestamp('20130101') + dfj2.index = date_range('20130101',periods=5) + dfj2.to_json('test.json') + open('test.json').read() Reading JSON ~~~~~~~~~~~~ @@ -984,7 +994,6 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json -- json : a VALID JSON string, optional, used if filepath_or_buffer is not provided - typ : type of object to recover (series or frame), default 'frame' - orient : The format of the JSON string, one of the following @@ -992,8 +1001,10 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` * records : list like [value, ... , value] * index : dict like {index -> value} -- dtype : dtype of the resulting Series +- dtype : dtype of the resulting object - numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. 
+- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True +- keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. @@ -1002,13 +1013,19 @@ Reading from a JSON string .. ipython:: python - pd.read_json(json='{"0":{"0":1,"1":3},"1":{"0":2,"1":4}}') + pd.read_json(json) + +Reading from a file, parsing dates + +.. ipython:: python -Reading from a StringIO + pd.read_json('test.json',parse_dates=True) .. ipython:: python + :suppress: - pd.read_json(json) + import os + os.remove('test.json') HTML ---- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ac9663a34e748..7e6ac4d5bbf68 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,7 +495,7 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) - def to_json(self, path_or_buf, orient=None, double_precision=10, + def to_json(self, path_or_buf=None, orient=None, double_precision=10, force_ascii=True): """ Convert the object to a JSON string. @@ -529,7 +529,7 @@ def to_json(self, path_or_buf, orient=None, double_precision=10, """ from pandas.io import json - return json.to_json(path_or_buf, self, orient=orient, double_precision=double_precision, + return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, double_precision=double_precision, force_ascii=force_ascii) # install the indexerse diff --git a/pandas/io/json.py b/pandas/io/json.py index 48412f21fbbdd..446cadf473325 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -1,47 +1,19 @@ # pylint: disable-msg=E1101,W0613,W0603 -from pandas import Series, DataFrame -from pandas.io.common import get_filepath_or_buffer from StringIO import StringIO +import os +from pandas import Series, DataFrame, to_datetime +from pandas.io.common import get_filepath_or_buffer import pandas.json as _json loads = _json.loads dumps = _json.dumps -### interface to/from ### +import numpy as np -def to_json(path_or_buf, obj, orient=None, double_precision=10, - force_ascii=True): - """ - Convert the object to a JSON string - - Note NaN's and None will be converted to null and datetime objects - will be converted to UNIX timestamps. - - Parameters - ---------- - path_or_buf : the pathname or buffer to write the output - if this is None, return a StringIO of the converted string - orient : {'split', 'records', 'index', 'columns', 'values'}, - default is 'index' for Series, 'columns' for DataFrame - - The format of the JSON string - split : dict like - {index -> [index], columns -> [columns], data -> [values]} - records : list like [{column -> value}, ... , {column -> value}] - index : dict like {index -> {column -> value}} - columns : dict like {column -> {index -> value}} - values : just the values array - double_precision : The number of decimal places to use when encoding - floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. 
- - Returns - ------- - result : a JSON compatible string written to the path_or_buf; - if the path_or_buf is none, return a StringIO of the result +### interface to/from ### - """ +def to_json(path_or_buf, obj, orient=None, double_precision=10, force_ascii=True): if orient is None: if isinstance(obj, Series): @@ -55,126 +27,229 @@ def to_json(path_or_buf, obj, orient=None, double_precision=10, with open(path_or_buf,'w') as fh: fh.write(s) elif path_or_buf is None: - return StringIO(s) + return s else: path_or_buf.write(s) -def read_json(filepath_or_buffer=None, json=None, typ='frame', orient=None, dtype=None, numpy=True): +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, + parse_dates=False, keep_default_dates=True): """ Convert JSON string to pandas object Parameters ---------- - filepath_or_buffer : a VALID JSON StringIO or file handle / StringIO. The string could be + filepath_or_buffer : a VALID JSON string or file handle / StringIO. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json - json : a VALID JSON string, optional, used if filepath_or_buffer is not provided - typ : type of object to recover (series or frame), default 'frame' orient : {'split', 'records', 'index'}, default 'index' The format of the JSON string split : dict like {index -> [index], name -> name, data -> [values]} records : list like [value, ... , value] index : dict like {index -> value} - dtype : dtype of the resulting Series + typ : type of object to recover (series or frame), default 'frame' + dtype : dtype of the resulting object numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. + parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns + default is False + keep_default_dates : boolean, default True. 
If parsing dates, + then parse the default datelike columns Returns ------- result : Series or DataFrame """ - if json is None: - filepath_or_buffer,_ = get_filepath_or_buffer(filepath_or_buffer) - if isinstance(filepath_or_buffer, basestring): - with open(filepath_or_buffer,'r') as fh: - json = fh.read() - elif hasattr(filepath_or_buffer, 'read'): - json = filepath_or_buffer.read() + filepath_or_buffer,_ = get_filepath_or_buffer(path_or_buf) + if isinstance(filepath_or_buffer, basestring): + if os.path.exists(filepath_or_buffer): + with open(filepath_or_buffer,'r') as fh: + json = fh.read() else: - json = filepath_or_buffer + json = filepath_or_buffer + elif hasattr(filepath_or_buffer, 'read'): + json = filepath_or_buffer.read() + else: + json = filepath_or_buffer obj = None if typ == 'frame': - if orient is None: - orient = 'columns' - obj = load_frame(json, orient, dtype, numpy) + obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() if typ == 'series' or obj is None: - if orient == 'columns': - orient = 'index' - obj = load_series(json, orient, dtype, numpy) + obj = SeriesParser(json, orient, dtype, numpy).parse() return obj -def load_series(json, orient, dtype, numpy): - s = None +class Parser(object): + _min_date = 31536000000000000L + + def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False): + self.json = json - if dtype is not None and orient == "split": - numpy = False + if orient is None: + orient = self._default_orient + + self.orient = orient + self.dtype = dtype - if numpy: - try: - if orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - s = Series(**decoded) - elif orient == "columns" or orient == "index": - s = Series(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - else: - s = Series(loads(json, dtype=dtype, numpy=True)) - except ValueError: + if dtype is not None and orient == "split": numpy = False - if not numpy: - if orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - s = Series(dtype=dtype, **decoded) - else: - s = Series(loads(json), dtype=dtype) - - return s - - -def load_frame(json, orient, dtype, numpy): - """ try to recover a frame, return None if we didn't get anything """ + self.numpy = numpy + self.parse_dates = parse_dates + self.keep_default_dates = keep_default_dates + self.obj = None + + def parse(self): + self._parse() + if self.obj is not None: + self.convert_axes() + if self.parse_dates: + self.try_parse_dates() + return self.obj + + def try_parse_dates(self): + raise NotImplementedError + +class SeriesParser(Parser): + _default_orient = 'index' + + def _parse(self): + + json = self.json + dtype = self.dtype + orient = self.orient + numpy = self.numpy + + if numpy: + try: + if orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = Series(**decoded) + elif orient == "columns" or orient == "index": + self.obj = Series(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + else: + self.obj = Series(loads(json, dtype=dtype, numpy=True)) + except ValueError: + numpy = False + + if not numpy: + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = Series(dtype=dtype, **decoded) + else: + self.obj = Series(loads(json), dtype=dtype) - if dtype is not None and orient == "split": - numpy = False + def convert_axes(self): + """ try to 
axes if they are datelike """ + if self.obj is None: return - if numpy: try: + self.obj.index = to_datetime(self.obj.index.astype('int64')) + except: + pass + +class FrameParser(Parser): + _default_orient = 'columns' + + def _parse(self): + + json = self.json + dtype = self.dtype + orient = self.orient + numpy = self.numpy + + if numpy: + try: + if orient == "columns": + args = loads(json, dtype=dtype, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame(loads(json, dtype=dtype, numpy=True)) + else: + self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + except ValueError: + numpy = False + + if not numpy: if orient == "columns": - args = loads(json, dtype=dtype, numpy=True, labelled=True) - if args: - args = (args[0].T, args[2], args[1]) - df = DataFrame(*args) + self.obj = DataFrame(loads(json), dtype=dtype) elif orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - df = DataFrame(**decoded) - elif orient == "values": - df = DataFrame(loads(json, dtype=dtype, numpy=True)) + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = DataFrame(dtype=dtype, **decoded) + elif orient == "index": + self.obj = DataFrame(loads(json), dtype=dtype).T else: - df = DataFrame(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - except ValueError: - numpy = False + self.obj = DataFrame(loads(json), dtype=dtype) - if not numpy: - if orient == "columns": - df = DataFrame(loads(json), dtype=dtype) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - df = DataFrame(dtype=dtype, **decoded) - elif orient == "index": - df = DataFrame(loads(json), dtype=dtype).T + def convert_axes(self): + """ try to axes if they are datelike """ + if self.obj is None: return + + if self.orient == 'columns': + axis = 'index' + elif self.orient == 'index': + axis = 'columns' else: - df = DataFrame(loads(json), dtype=dtype) + return + + try: + a = getattr(self.obj,axis).astype('int64') + if (a>self._min_date).all(): + setattr(self.obj,axis,to_datetime(a)) + except: + pass + + def try_parse_dates(self): + """ + try to parse out dates + these are only in in64 columns + """ - return df + if self.obj is None: return + + # our columns to parse + parse_dates = self.parse_dates + if parse_dates is True: + parse_dates = [] + parse_dates = set(parse_dates) + + def is_ok(col, c): + """ return if this col is ok to try for a date parse """ + if not isinstance(col, basestring): return False + + if issubclass(c.dtype.type,np.number) and (c Date: Sun, 9 Jun 2013 10:34:40 -0400 Subject: [PATCH 09/10] ENH: added date_format parm to to_josn to allow epoch or iso formats (which both can be can be parsed with parse_dates=True in read_json) --- doc/source/io.rst | 14 +- pandas/core/generic.py | 10 +- pandas/io/json.py | 192 +++++++++++++++++------ pandas/io/tests/test_json/test_pandas.py | 33 +++- 4 files changed, 194 insertions(+), 55 deletions(-) mode change 100755 => 100644 pandas/io/tests/test_json/test_pandas.py diff --git a/doc/source/io.rst b/doc/source/io.rst index ee234bc352090..e64cbc4bc8101 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -962,10 +962,11 @@ with optional parameters: * 
columns : dict like {column -> {index -> value}} * values : just the values array +- date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch - double_precision : The number of decimal places to use when encoding floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. -Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. +Note NaN's and None will be converted to null and datetime objects will be converted based on the date_format parameter .. ipython:: python @@ -973,6 +974,15 @@ Note NaN's and None will be converted to null and datetime objects will be conve json = dfj.to_json() json +Writing in iso date format + +.. ipython:: python + + dfd = DataFrame(randn(5, 2), columns=list('AB')) + dfd['date'] = Timestamp('20130101') + json = dfd.to_json(date_format='iso') + json + Writing to a file, with a date index and a date column .. ipython:: python @@ -1003,7 +1013,7 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` - dtype : dtype of the resulting object - numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. -- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True +- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is False - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7e6ac4d5bbf68..0d2612d7aed7a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,8 +495,8 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) - def to_json(self, path_or_buf=None, orient=None, double_precision=10, - force_ascii=True): + def to_json(self, path_or_buf=None, orient=None, date_format='epoch', + double_precision=10, force_ascii=True): """ Convert the object to a JSON string. @@ -517,6 +517,8 @@ def to_json(self, path_or_buf=None, orient=None, double_precision=10, index : dict like {index -> {column -> value}} columns : dict like {column -> {index -> value}} values : just the values array + date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), + default is epoch double_precision : The number of decimal places to use when encoding floating point values, default 10. force_ascii : force encoded string to be ASCII, default True. 
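A short sketch of the two ``date_format`` encodings this commit adds (the epoch values shown are the assumed millisecond stamps for 2013-01-01)::

    from pandas import DataFrame, Timestamp

    df = DataFrame({'A': [1, 2]})
    df['date'] = Timestamp('20130101')

    # epoch (default): datetimes encode as millisecond timestamps
    df.to_json()                   # ..."date":{"0":1356998400000,"1":1356998400000}}

    # iso: datetimes encode as ISO8601 strings
    df.to_json(date_format='iso')  # ..."date":{"0":"2013-01-01T00:00:00","1":"2013-01-01T00:00:00"}}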
@@ -529,8 +531,8 @@ def to_json(self, path_or_buf=None, orient=None, double_precision=10, """ from pandas.io import json - return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, double_precision=double_precision, - force_ascii=force_ascii) + return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, date_format=date_format, + double_precision=double_precision, force_ascii=force_ascii) # install the indexerse for _name, _indexer in indexing.get_indexers_list(): diff --git a/pandas/io/json.py b/pandas/io/json.py index 446cadf473325..17b33931bee5a 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -10,26 +10,107 @@ dumps = _json.dumps import numpy as np +from pandas.tslib import iNaT ### interface to/from ### -def to_json(path_or_buf, obj, orient=None, double_precision=10, force_ascii=True): +def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True): + if isinstance(obj, Series): + s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, + ensure_ascii=force_ascii).write() + elif isinstance(obj, DataFrame): + s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, + ensure_ascii=force_ascii).write() + else: + raise NotImplementedError + + if isinstance(path_or_buf, basestring): + with open(path_or_buf,'w') as fh: + fh.write(s) + elif path_or_buf is None: + return s + else: + path_or_buf.write(s) + +class Writer(object): + + def __init__(self, obj, orient, date_format, double_precision, ensure_ascii): + self.obj = obj + if orient is None: - if isinstance(obj, Series): - orient = 'index' - elif isinstance(obj, DataFrame): - orient = 'columns' - - s = dumps(obj, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) - if isinstance(path_or_buf, basestring): - with open(path_or_buf,'w') as fh: - fh.write(s) - elif path_or_buf is None: - return s + orient = self._default_orient + + self.orient = orient + self.date_format = date_format + self.double_precision = double_precision + self.ensure_ascii = ensure_ascii + + self.is_copy = False + self._format_axes() + self._format_dates() + + def _format_dates(self): + raise NotImplementedError + + def _format_axes(self): + raise NotImplementedError + + def _needs_to_date(self, data): + return self.date_format == 'iso' and data.dtype == 'datetime64[ns]' + + def _format_to_date(self, data): + if self._needs_to_date(data): + return data.apply(lambda x: x.isoformat()) + return data + + def copy_if_needed(self): + """ copy myself if necessary """ + if not self.is_copy: + self.obj = self.obj.copy() + self.is_copy = True + + def write(self): + return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii) + +class SeriesWriter(Writer): + _default_orient = 'index' + + def _format_axes(self): + if self._needs_to_date(self.obj.index): + self.copy_if_needed() + self.obj.index = self._format_to_date(self.obj.index.to_series()) + + def _format_dates(self): + if self._needs_to_date(self.obj): + self.copy_if_needed() + self.obj = self._format_to_date(self.obj) + +class FrameWriter(Writer): + _default_orient = 'columns' + + def _format_axes(self): + """ try to axes if they are datelike """ + if self.orient == 'columns': + axis = 'index' + elif self.orient == 'index': + axis = 'columns' else: - path_or_buf.write(s) + return + + a = getattr(self.obj,axis) + if self._needs_to_date(a): + self.copy_if_needed() + 
setattr(self.obj,axis,self._format_to_date(a.to_series())) + + def _format_dates(self): + if self.date_format == 'iso': + dtypes = self.obj.dtypes + dtypes = dtypes[dtypes == 'datetime64[ns]'] + if len(dtypes): + self.copy_if_needed() + for c in dtypes.index: + self.obj[c] = self._format_to_date(self.obj[c]) def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, parse_dates=False, keep_default_dates=True): @@ -79,12 +160,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() if typ == 'series' or obj is None: - obj = SeriesParser(json, orient, dtype, numpy).parse() + obj = SeriesParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() return obj class Parser(object): - _min_date = 31536000000000000L def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False): self.json = json @@ -106,12 +186,43 @@ def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_d def parse(self): self._parse() if self.obj is not None: - self.convert_axes() + self._convert_axes() if self.parse_dates: - self.try_parse_dates() + self._try_parse_dates() return self.obj - def try_parse_dates(self): + + def _try_parse_to_date(self, data): + """ try to parse a ndarray like into a date column + try to coerce object in epoch/iso formats and + integer/float in epcoh formats """ + + new_data = data + if new_data.dtype == 'object': + try: + new_data = data.astype('int64') + except: + pass + + + # ignore numbers that are out of range + if issubclass(new_data.dtype.type,np.number): + if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all(): + return data + + try: + new_data = to_datetime(new_data) + except: + try: + new_data = to_datetime(new_data.astype('int64')) + except: + + # return old, noting more we can do + new_data = data + + return new_data + + def _try_parse_dates(self): raise NotImplementedError class SeriesParser(Parser): @@ -146,15 +257,19 @@ def _parse(self): else: self.obj = Series(loads(json), dtype=dtype) - def convert_axes(self): + def _convert_axes(self): """ try to axes if they are datelike """ - if self.obj is None: return - try: - self.obj.index = to_datetime(self.obj.index.astype('int64')) + self.obj.index = self._try_parse_to_date(self.obj.index) except: pass + def _try_parse_dates(self): + if self.obj is None: return + + if self.parse_dates: + self.obj = self._try_parse_to_date(self.obj) + class FrameParser(Parser): _default_orient = 'columns' @@ -196,10 +311,8 @@ def _parse(self): else: self.obj = DataFrame(loads(json), dtype=dtype) - def convert_axes(self): + def _convert_axes(self): """ try to axes if they are datelike """ - if self.obj is None: return - if self.orient == 'columns': axis = 'index' elif self.orient == 'index': @@ -208,18 +321,12 @@ def convert_axes(self): return try: - a = getattr(self.obj,axis).astype('int64') - if (a>self._min_date).all(): - setattr(self.obj,axis,to_datetime(a)) + a = getattr(self.obj,axis) + setattr(self.obj,axis,self._try_parse_to_date(a)) except: pass - def try_parse_dates(self): - """ - try to parse out dates - these are only in in64 columns - """ - + def _try_parse_dates(self): if self.obj is None: return # our columns to parse @@ -228,13 +335,10 @@ def try_parse_dates(self): parse_dates = [] parse_dates = set(parse_dates) - def is_ok(col, c): + def is_ok(col): """ return if this col is ok to try for a date parse """ if not 
isinstance(col, basestring): return False - if issubclass(c.dtype.type,np.number) and (c Date: Tue, 11 Jun 2013 10:01:26 -0400 Subject: [PATCH 10/10] BUG: patch in weird nested decoding issue, courtesy of @Komnomnomnom --- pandas/io/tests/test_json/test_pandas.py | 23 +++++++++++++++++++++++ pandas/src/ujson/python/JSONtoObj.c | 10 ++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 7a639457e51e9..b64bfaacd38f2 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -314,6 +314,29 @@ def test_date_format(self): result = read_json(json,typ='series',parse_dates=True) assert_series_equal(result,ts) + def test_weird_nested_json(self): + + # this used to core dump the parser + s = r'''{ + "status": "success", + "data": { + "posts": [ + { + "id": 1, + "title": "A blog post", + "body": "Some useful content" + }, + { + "id": 2, + "title": "Another blog post", + "body": "More content" + } + ] + } +}''' + + read_json(s) + @network @slow def test_url(self): diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c index 1db7586ad17f7..bc42269d9698b 100644 --- a/pandas/src/ujson/python/JSONtoObj.c +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -10,6 +10,7 @@ typedef struct __PyObjectDecoder JSONObjectDecoder dec; void* npyarr; // Numpy context buffer + void* npyarr_addr; // Ref to npyarr ptr to track DECREF calls npy_intp curdim; // Current array dimension PyArray_Descr* dtype; @@ -67,9 +68,7 @@ void Npy_releaseContext(NpyArrContext* npyarr) } if (npyarr->dec) { - // Don't set to null, used to make sure we don't Py_DECREF npyarr - // in releaseObject - // npyarr->dec->npyarr = NULL; + npyarr->dec->npyarr = NULL; npyarr->dec->curdim = 0; } Py_XDECREF(npyarr->labels[0]); @@ -88,6 +87,7 @@ JSOBJ Object_npyNewArray(void* _decoder) { // start of array - initialise the context buffer npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + decoder->npyarr_addr = npyarr; if (!npyarr) { @@ -515,7 +515,7 @@ JSOBJ Object_newDouble(double value) static void Object_releaseObject(JSOBJ obj, void* _decoder) { PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; - if (obj != decoder->npyarr) + if (obj != decoder->npyarr_addr) { Py_XDECREF( ((PyObject *)obj)); } @@ -555,11 +555,13 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) pyDecoder.dec = dec; pyDecoder.curdim = 0; pyDecoder.npyarr = NULL; + pyDecoder.npyarr_addr = NULL; decoder = (JSONObjectDecoder*) &pyDecoder; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiO&", kwlist, &sarg, &numpy, &labelled, PyArray_DescrConverter2, &dtype)) { + Npy_releaseContext(pyDecoder.npyarr); return NULL; }
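The new test pins down the crash path: a minimal sketch of the decode that used to core dump the numpy-backed parser (payload taken from the test above)::

    import pandas as pd

    s = '''{"status": "success",
            "data": {"posts": [
                {"id": 1, "title": "A blog post", "body": "Some useful content"},
                {"id": 2, "title": "Another blog post", "body": "More content"}
            ]}}'''

    # with npyarr_addr tracked, the decoder no longer DECREFs its own
    # context buffer while unwinding nested objects
    pd.read_json(s)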