switch to new marshalling implementation #45

Merged: 3 commits, Sep 12, 2023
81 changes: 81 additions & 0 deletions benchmarks/20230912-comparison-with-legacy.org
@@ -0,0 +1,81 @@
Running on @karlicoss's desktop PC, =python3.10=.

This is basically to justify switching to the new serialization method:

- old way, =legacy=: used to 'flatten' the type into an sqlite row, one column per field
- new way, =cachew=: dumps the object as a dict, then to bytes via =orjson=, and stores it in a single sqlite column (both approaches are sketched below)
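
To make the difference concrete, here is a conceptual sketch with a hypothetical =Point= type (not the actual cachew API):

#+begin_src python
import dataclasses
import orjson  # fast JSON (de)serialization to/from bytes

@dataclasses.dataclass
class Point:
    x: int
    y: int

p = Point(x=1, y=2)

# old way: flatten the object into a row, one sqlite column per field
legacy_row = (p.x, p.y)  # e.g. INSERT INTO cache (x, y) VALUES (?, ?)

# new way: object -> dict -> bytes, stored in a single BLOB column
blob = orjson.dumps(dataclasses.asdict(p))  # b'{"x":1,"y":2}'
restored = Point(**orjson.loads(blob))
assert restored == p
#+end_src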

The numbers for legacy and cachew can't be compared directly, though.
The legacy =serializing= step emits a tuple, which can be inserted into the database directly.
So to compare it with the new way, we need to use the sum of =serializing= + =json dump=.
That said, this won't be an exact comparison either, since the legacy binder relied on sqlalchemy to dump custom types (e.g. =datetime= or =Exception=) to sqlite types. So legacy gets a slight advantage this way, but it's fine.

So we can see that for:
- =test_union_str_dataclass=
  - new implementation: =0.53 + 0.45 = 0.98s= to serialize; =0.29 + 0.48 = 0.77s= to deserialize
  - old implementation: =2.38s= to serialize; =1.92s= to deserialize
- =test_nested_dataclass=
  - new implementation: =1.05 + 0.26 = 1.31s= to serialize; =0.50 + 1.42 = 1.92s= to deserialize
  - old implementation: =1.92s= to serialize; =1.88s= to deserialize

For both tests, serialization is quite a bit faster with the new implementation (=0.98s= vs =2.38s=, and =1.31s= vs =1.92s=).
On the second test they are on par for deserialization, but as mentioned above, these numbers are already slightly in favor of the legacy implementation.

In addition, keeping everything in one column unlocks some other optimizations which wouldn't be possible with multiple columns.
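
For reference, the full round trip of the new pipeline looks roughly like this (a minimal sketch using =CachewMarshall= and =orjson= as wired up in this PR; the =Name= dataclass here is just an illustration):

#+begin_src python
from dataclasses import dataclass
from cachew.marshall.cachew import CachewMarshall
from orjson import dumps as orjson_dumps, loads as orjson_loads

@dataclass
class Name:
    name: str

marshall = CachewMarshall(Type_=Name)

# serialize: object -> json-compatible structure -> bytes (one BLOB column)
blob = orjson_dumps(marshall.dump(Name(name='alice')))
# deserialize: bytes -> structure -> object
obj = marshall.load(orjson_loads(blob))
assert obj == Name(name='alice')
#+end_src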


#+begin_example
$ pytest --pyargs cachew.tests.marshall -k 'gc_off and 1000000 and not cattrs' -s
=========================================================== test session starts ============================================================
platform linux -- Python 3.10.12, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /code/cachew_jsonpickle
configfile: pytest.ini
plugins: anyio-3.6.2
collected 100 items / 95 deselected / 5 selected

src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.34s
serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.53s
json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.08s
sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.18s
jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.13s
json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.48s
PASSED
src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-legacy]
building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.35s
serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 2.38s
json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.22s
sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.06s
sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.23s
deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.92s
PASSED
src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-cachew]
building 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.58s
serializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.05s
json dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.26s
sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.03s
sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.30s
jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.14s
jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.14s
json load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.50s
deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.42s
PASSED
src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-legacy]
building 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.56s
serializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.92s
json dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.21s
sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.99s
sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.29s
jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.12s
jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.12s
json load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.24s
deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.88s
PASSED
#+end_example
2 changes: 1 addition & 1 deletion setup.py
@@ -9,6 +9,7 @@
install_requires = [
'appdirs' , # default cache dir
'sqlalchemy>=1.0', # cache DB interaction
'orjson', # fast json serialization
]


@@ -49,7 +50,6 @@ def main() -> None:

'enlighten', # used in logging helper, but not really required

'orjson', # for now test only but may actually use soon
'cattrs', # benchmarking alternative marshalling implementation

'pyinstrument', # for profiling from within tests
41 changes: 27 additions & 14 deletions src/cachew/__init__.py
@@ -14,6 +14,16 @@
import dataclasses
import warnings

try:
# orjson might not be available on some architectures, so let's make it defensive just in case
from orjson import loads as orjson_loads, dumps as orjson_dumps # pylint: disable=no-name-in-module
except:
warnings.warn("orjson couldn't be imported. It's _highly_ recommended for better caching performance")
def orjson_dumps(*args, **kwargs): # type: ignore[misc]
# sqlite needs a blob
return json.dumps(*args, **kwargs).encode('utf8')

orjson_loads = json.loads

import appdirs

@@ -27,8 +37,8 @@
except Exception as e:
logging.exception(e)

from .legacy import NTBinder
from .logging_helper import makeLogger
from .marshall.cachew import CachewMarshall
from .utils import (
is_primitive,
is_union,
@@ -142,11 +152,10 @@ def do_begin(conn):
self.meta = sqlalchemy.MetaData()
self.table_hash = Table('hash', self.meta, Column('value', sqlalchemy.String))

self.binder = NTBinder.make(tp=cls)
# actual cache
self.table_cache = Table('cache' , self.meta, *self.binder.columns)
self.table_cache = Table('cache' , self.meta, Column('data', sqlalchemy.BLOB))
# temporary table, we use it to insert and then (atomically?) rename to the above table at the very end
self.table_cache_tmp = Table('cache_tmp', self.meta, *self.binder.columns)
self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.BLOB))

def __enter__(self) -> 'DbHelper':
return self
@@ -463,8 +472,7 @@ def composite_hash(self, *args, **kwargs) -> Dict[str, Any]:
if k in hsig.parameters or 'kwargs' in hsig.parameters
}
kwargs = {**defaults, **kwargs}
binder = NTBinder.make(tp=self.cls_)
schema = str(binder.columns) # todo not super nice, but works fine for now
schema = str(self.cls_)
hash_parts = {
'cachew' : CACHEW_VERSION,
'schema' : schema,
@@ -547,7 +555,7 @@ def cachew_wrapper(
db.connection.begin():
# NOTE: deferred transaction
conn = db.connection
binder = db.binder
marshall = CachewMarshall(Type_=cls)
table_cache = db.table_cache
table_cache_tmp = db.table_cache_tmp

@@ -579,7 +587,9 @@
def cached_items():
rows = conn.execute(table_cache.select())
for row in rows:
yield binder.from_row(row)
j = orjson_loads(row[0])
obj = marshall.load(j)
yield obj

if new_hash == old_hash:
logger.debug('hash matched: loading from cache')
@@ -683,8 +693,7 @@ def missing_keys(cached: List[str], wanted: List[str]) -> Optional[List[str]]:
def flush() -> None:
nonlocal chunk
if len(chunk) > 0:
# TODO hmm, it really doesn't work unless you zip into a dict first
# maybe should return dicts from binder instead then?
# TODO optimize this, we really don't need to make extra dicts here just to insert
chunk_dict = [
dict(zip(column_names, row))
for row in chunk
@@ -693,15 +702,17 @@ def flush() -> None:
chunk = []

total_objects = 0
for d in datas:
for obj in datas:
try:
total_objects += 1
yield d
yield obj
except GeneratorExit:
early_exit = True
return

chunk.append(binder.to_row(d))

dct = marshall.dump(obj)
j = orjson_dumps(dct)
chunk.append((j,))
if len(chunk) >= chunk_by:
flush()
flush()
@@ -732,6 +743,8 @@ def flush() -> None:
yield from func(*args, **kwargs)


from .legacy import NTBinder

__all__ = [
'cachew',
'CachewException',
39 changes: 33 additions & 6 deletions src/cachew/marshall/cachew.py
@@ -3,7 +3,7 @@
from abc import abstractmethod
from collections import abc
from dataclasses import dataclass, is_dataclass
from datetime import datetime, timezone
from datetime import date, datetime, timezone
import sys
import types
from typing import (
@@ -28,6 +28,7 @@
Json,
T,
)
from ..utils import CachewException


class CachewMarshall(AbstractMarshall[T]):
@@ -206,8 +207,14 @@ def load(self, dct):
def _exc_helper(args):
for a in args:
at = type(a)
assert at in JTypes, (a, at)
yield a
if at in JTypes:
yield a
elif issubclass(at, date):
# TODO would be nice to restore datetime from cache too
# maybe generally save exception as a union? or intact and let orjson save it?
yield a.isoformat()
else:
yield str(a) # not much we can do..


@dataclass(**SLOTS)
@@ -245,6 +252,15 @@ def load(self, dct: tuple):
return dt.astimezone(tz)


@dataclass(**SLOTS)
class XDate(Schema):
def dump(self, obj: date) -> Json:
return obj.isoformat()

def load(self, dct: str):
return date.fromisoformat(dct)


ident = lambda x: x


@@ -299,7 +315,13 @@ def build_schema(Type) -> Schema:
if issubclass(Type, datetime):
return XDatetime(type=Type)

assert is_dataclass(Type) or is_namedtuple(Type)
if issubclass(Type, date):
return XDate(type=Type)

if not (is_dataclass(Type) or is_namedtuple(Type)):
raise CachewException(
f"{Type} doesn't look like a supported type to cache. See https://github.com/karlicoss/cachew#features for the list of supported types."
)
hints = get_type_hints(Type)
fields = tuple((k, build_schema(t)) for k, t in hints.items())
return SDataclass(
@@ -413,7 +435,7 @@ def test_serialize_and_deserialize() -> None:
# optionals
helper('aaa', Optional[str])
helper('aaa', Union[str, None])
helper(None , Union[str, None])
helper(None, Union[str, None])

# lists
helper([1, 2, 3], List[int])
@@ -467,6 +489,10 @@ class WithJson:
RuntimeError('more stuff'),
RuntimeError(),
], List[Union[RuntimeError, Point]])

    exc_with_datetime = Exception('I happened on', datetime.fromisoformat('2021-04-03T10:11:12'))
    exc_with_datetime_exp = Exception('I happened on', '2021-04-03T10:11:12')
helper(exc_with_datetime, Exception, expected=exc_with_datetime_exp)
# fmt: on

# datetimes
Expand All @@ -485,7 +511,6 @@ class WithJson:
dwinter,
dsummer,
dsummer.replace(tzinfo=timezone.utc),
# TODO date class as well?
]
for d in dates:
jj, dd = helper(d, datetime)
@@ -498,5 +523,7 @@ class WithJson:
assert helper(dsummer_tz, datetime)[0] == ('2020-08-03T01:02:03+01:00', 'Europe/London')
assert helper(dwinter, datetime)[0] == ('2020-02-03T01:02:03', None)

assert helper(dwinter.date(), date)[0] == '2020-02-03'


# TODO test type aliases and such??