FI-2921: Support SQLAlchemy database result backend #3

Merged · 3 commits · Sep 25, 2023
1 change: 1 addition & 0 deletions Dockerfile
@@ -5,6 +5,7 @@ RUN apk add --no-cache ca-certificates tzdata && update-ca-certificates

# Install the required packages
RUN pip install --no-cache-dir redis flower
RUN pip install --no-cache-dir "SQLAlchemy>=1.4,<2" "psycopg2-binary>=2.9,<3"
Author

I don't love that we're choosing a version of these libraries to support when building the Docker image; that seems like a decision that should be left up to individual services. For example, what if one microservice wants to use SQLAlchemy 2.x.x, but another wants to use SQLAlchemy 1.4.x? Because Flower interacts with the same database that the microservice's own Celery worker writes to, it seems like the service should be managing the SQLAlchemy/Postgres library versions. If we want to adopt this, then we may need to shift our pattern away from building a final production Flower image, and instead publish a "base" Flower image that other repos extend in order to render their own results. Does that sound right?

In the meantime, I don't think we're deviating too far from existing Flower patterns by putting this here. The base branch of Flower already does pip install ... redis .... This introduces the exact same potential for dependency conflicts as I described above, but for Redis library versions instead of SQLAlchemy library versions.

Does that sound right?

Doesn't sound right to me. From what I have seen, Flower is designed to run as a standalone application. This means that the service's Celery result backend happens to be writing to a Postgres database, and Flower happens to be reading from that same Postgres database. In this scenario it shouldn't even matter whether or not Flower is using SQLAlchemy. The thing that matters is that the schemas Flower is reading from are the same schemas that Celery is writing to.

it seems like the service should be managing the SQLAlchemy/Postgres library versions

It's more that the service and Flower need to agree on the underlying Postgres schema representation, which likely means they need to coordinate on the same version of Celery.

ship

I'm going to cut this comment short:

I want to touch on matrix testing: certainly I would like to know with confidence that the shipped code is going to work with the known supported target versions.

This gets into a broader discussion of "how do we organize the manpower required to support a rolling release" though, especially because our customers (other devs) are largely going to expect the latest versions of libx/liby from PyPI, enforced by Dependabot even.

For now, this is OK as is. Maybe add a TODO comment to make sure we verify it works against Celery latest, stable, and oldstable?

I know it's a dead conversation, but the phrase

happens to be writing to a Postgres database and Flower happens to be reading from that same Postgres database

screams red flag to me. Please remind me: why are we not adding API methods to Celery that do the database reads, and reading from that new API in Flower?

why are we not adding API methods to Celery that do the database reads, and reading from that new API in Flower?

We are using the Celery API methods through the Celery result backend.
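To make that concrete, here's a rough sketch (app name and connection string are placeholders, not from this PR) of what "going through the result backend" means: Flower reads via the backend object that Celery itself constructs, and never opens its own raw connection to Postgres.

from celery import Celery

app = Celery("example", backend="db+postgresql://postgres:postgres@localhost:5432/postgres")
backend = app.backend                      # a celery.backends.database.DatabaseBackend instance
with backend.ResultSession() as session:   # SQLAlchemy session created and managed by Celery
    pass  # DatabaseBackendResultsStore runs its queries inside a session like this one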

These points are exactly why this change is a red flag.

@phillipuniverse Sep 21, 2023

FYI @MaxwellPayne, @dylan-shipwell and I talked about this offline. We're going to move forward here, but we need to write up the tradeoffs we're making and have a diagram of how the Flower deployable with these customizations interacts with the Celery result backend. I was going to take a first stab at that, and the three of us can review to make sure we have the risks and issues fully documented and identified, and can objectively assess, as we continue using Celery, whether the tradeoffs we are making now are still legit.

Author

Agreed. I think in this situation, there is no "right answer." So we should disagree and commit, and write down the reasons why we're making the choices. Do you have a ticket for this that we can reference for posterity?

@MaxwellPayne I created https://shipwell.atlassian.net/browse/SHARE-2822. Isn't there an epic you have going somewhere for these Celery tasks already?

Author

Thanks. 🙏 Just updated the ticket with the relevant epic.


# PYTHONUNBUFFERED: Force stdin, stdout and stderr to be totally unbuffered. (equivalent to `python -u`)
# PYTHONHASHSEED: Enable hash randomization (equivalent to `python -R`)
8 changes: 8 additions & 0 deletions docker-compose.yml
@@ -4,6 +4,14 @@ services:
image: redis:alpine
ports:
- 6379:6379
postgres:
image: postgres:14-alpine
environment:
POSTGRES_DB: postgres
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
ports:
- 5432:5432
prometheus:
image: prom/prometheus
volumes:
16 changes: 9 additions & 7 deletions flower/utils/results/result.py
@@ -2,7 +2,7 @@
import heapq
from collections.abc import Iterator, Iterable
from functools import total_ordering
from typing import Any, Sequence, TypeAlias
from typing import Any, TypeAlias

import dateutil.parser
import kombu.clocks
@@ -14,12 +14,12 @@ def __init__(
*,
task_id: str,
status: str,
date_done: str,
date_done: str | datetime.datetime,
Author

Convenience change to __init__() so that different backends can more easily initialize a Result(...) instance despite the backend subclasses representing timestamps using different data types.
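As a rough illustration (field values are placeholders), both of these now construct cleanly, whereas previously the datetime variant would have been handed to dateutil.parser.parse() and raised:

import datetime
from flower.utils.results.result import Result

# Redis-style: the timestamp arrives as an ISO-8601 string and is parsed with dateutil
r1 = Result(task_id="t1", status="SUCCESS", date_done="2023-09-20T12:00:00+00:00",
            result=None, traceback=None)

# Database-style: the timestamp arrives as a datetime straight off the ORM column
r2 = Result(task_id="t2", status="SUCCESS",
            date_done=datetime.datetime(2023, 9, 20, 12, 0, tzinfo=datetime.timezone.utc),
            result=None, traceback=None)

assert isinstance(r1.date_done, datetime.datetime) and isinstance(r2.date_done, datetime.datetime)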

result: Any,
traceback: Any,
# fields with default values may be null when Celery's `result_extended=False`
args: Sequence[Any] | None = None,
kwargs: dict[str, Any] | None = None,
args: Any = None,
kwargs: Any = None,
Comment on lines -21 to +22
Author

In theory, args should always be a Sequence and kwargs should always be a dict (or at least a Mapping). But I ran into some complexities when it comes to deserializing args/kwargs from binary columns from SQLAlchemy records.

Because results are long-lived, it's possible that a result database table has records that, at the time of creation, used different encoding schemes (e.g. JSON, YAML, pickle, custom). Our result data store makes a best-effort attempt to deserialize these binary values, but ultimately it's very possible that some records could completely fail to deserialize. For example, if someone created a task result row using a "custom" serializer that has since been deleted from the user's Celery codebase, it will be impossible for our result store to deserialize that row.

To avoid 500 errors, our result stores can gracefully handle these failures by simply returning the raw args or kwargs data as it looked when extracted from the data store. Even if we fail to deserialize and you get something ugly like b'some_kwarg : some_value', it's still better than nothing. We don't do much manipulation of args or kwargs here anyway, so we can afford to sacrifice type specificity for the sake of resilience.
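A quick sketch of how that plays out at render time (values are made up): to_render_dict() only repr()s args/kwargs when they aren't already strings, so the fallback message is displayed verbatim rather than being quoted a second time.

from flower.utils.results.result import Result

r = Result(task_id="t3", status="SUCCESS", date_done="2023-09-20T00:00:00",
           result=None, traceback=None,
           kwargs="Failed to deserialize binary value: b'some_kwarg : some_value'")
rendered = r.to_render_dict()
# the string passes through untouched; a real dict would have been repr()'d instead
assert rendered["kwargs"].startswith("Failed to deserialize")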

I followed some of this but I need to see it live to really internalize it. But what you're saying makes sense and sounds like you've gone through a good evaluation of the edge cases 👍

name: str | None = None,
# add graceful handling for extra fields that may have been persisted to the result backend record
**kw: Any,
@@ -30,7 +30,9 @@ def __init__(
"""
self.task_id = task_id
self.status = status
self.date_done: datetime.datetime = dateutil.parser.parse(date_done)
self.date_done: datetime.datetime = (
dateutil.parser.parse(date_done) if isinstance(date_done, str) else date_done
)
self.result = result
self.traceback = traceback
self.name = name
@@ -56,8 +58,8 @@ def to_render_dict(self) -> dict[str, Any]:
"task_id": self.task_id,
"status": self.status,
"date_done": self.date_done.timestamp(),
"args": repr(self.args),
"kwargs": repr(self.kwargs),
"args": self.args if isinstance(self.args, str) else repr(self.args),
"kwargs": self.kwargs if isinstance(self.kwargs, str) else repr(self.kwargs),
"result": repr(self.result),
"traceback": str(self.traceback),
}
101 changes: 98 additions & 3 deletions flower/utils/results/stores/database.py
@@ -1,13 +1,108 @@
import json
import pickle
from collections.abc import Iterator
from typing import Any

from celery.backends.database import DatabaseBackend
import kombu.serialization
import sqlalchemy as sa
from celery.backends.database import DatabaseBackend, TaskExtended
from sqlalchemy.orm.session import Session

from flower.utils.results.result import ResultIdWithResultPair
from flower.utils.results.result import ResultIdWithResultPair, Result
from flower.utils.results.stores import AbstractBackendResultsStore


class DatabaseBackendResultsStore(AbstractBackendResultsStore[DatabaseBackend]):
"""
Author

Changes to this class are the meat of the PR.

Results store capable of reading from Celery's supported `DatabaseBackend`, which uses SQLAlchemy models to persist
tasks in a SQL database.
"""
def results_by_timestamp(self, limit: int | None = None, reverse: bool = True) -> Iterator[
ResultIdWithResultPair
]:
raise NotImplementedError()
query_limit = self.max_tasks_in_memory
if limit is not None and limit < query_limit:
query_limit = limit

session: Session
with self.backend.ResultSession() as session:
ordering = TaskExtended.date_done.desc() if reverse else TaskExtended.date_done.asc()
task_select_query = sa.select(TaskExtended).order_by(ordering).limit(query_limit)
for task in session.execute(task_select_query).scalars():
result = self._map_task_to_result(task)
yield result.task_id, result

def _map_task_to_result(self, task: TaskExtended) -> Result:
"""
Convert a `TaskExtended` ORM object into our shared `Result` data structure. This class assumes the usage of
`TaskExtended` in order to query the "taskmeta" table, since `TaskExtended` queries can successfully return
full data for both tasks that were saved with `result_extended=True` and those that were saved with
`result_extended=False`.

Because we want to support both extended and non-extended tasks, we need a way to figure out whether the
provided task was _actually_ extended or not at the time it was saved. We can do this by looking at the `name`
field. When a task is saved under `result_extended=True`, then it will have a name referencing the name of the
function. Otherwise, that field will be null and we know that it was `result_extended=False`.
Comment on lines +42 to +45

Clever... how stable do you think this check is?

In general I'd like to advocate against choosing to have the semi-predicate problem; if we can include a bit for is_result_extended, I would much prefer using it, especially when we're in Python and have tuples/dicts given to us for free.

Author

I think it's pretty stable. I've dug deep into the Celery implementation, and it seems like when you have extended=True, you get name, args, and kwargs. I figured that name is the best of those to use, since it may be possible for args or kwargs to be null for some reason.

to be null for some reason

I assume the obvious would be something like:

@app.task  # assuming `app` is the service's Celery() instance
def my_function():
    pass

my_function.delay()

Author

Yes, although I'm not sure whether Celery would store NULLs for a no-arg task, or whether it would store args=() and kwargs={}. My big fear was that this might vary in different scenarios, so I figured name was a safer choice.

"""
is_actually_extended: bool = task.name is not None
if is_actually_extended:
return Result(
task_id=task.task_id,
status=task.status,
date_done=task.date_done,
result=task.result,
traceback=task.traceback,
args=self.deserialize_binary_column_value(task.args),
kwargs=self.deserialize_binary_column_value(task.kwargs),
name=task.name,
)

return Result(
task_id=task.task_id,
status=task.status,
date_done=task.date_done,
result=task.result,
traceback=task.traceback,
)

def deserialize_binary_column_value(self, value: bytes) -> Any:
"""
Attempt to deserialize the provided `value` using the available serialization decoders. Celery stores task
`args` and `kwargs` in binary columns, but the proper decoding mechanism for those binary columns is not
immediately obvious. These fields get serialized based on whatever the value of `Celery.conf.result_serializer`
is at the time the task result is saved. However, it's possible that the value of that config setting changed
across different Celery processes, and therefore we may be dealing with a database that has co-mingled records
from different serializers. Unfortunately, there is no column in the database schema that records which
serializer was used for each task.
Comment on lines +70 to +76
@dylan-shipwell Sep 20, 2023

Sounds like somewhere there is a missing version/serial number recording which binary format was stored. Is there any chance we could add that bincompat monotonic number and fail out when our current bincompat number is larger than the recalled bincompat number?

Postgres does something I think is smart: they only support the current and previous bincompat numbers, so as long as you run the service once on every release version, it will upgrade the stored data up one version. LTO uses the same pattern.

Did this case get discovered in testing?

Aside: again, red flags all over this.

Again, the semi-predicate problem here means we don't actually know what went wrong at runtime, which is another red flag.

Author

I see your point about the semi-predicate problem; we would want some way to know that things are going wrong. Perhaps we could append some sort of (Failed to decode) prefix to the return value? That allows the user to know that an error took place, but they also have some visibility into the byte string in case it's useful.

Re: serial number. We're basically at the mercy of pickle, json, or whatever data encoding gets used. We don't control how Celery translates its args/kwargs/results to Postgres binary objects upon save, so it's not easy for us to introduce versioning or any other type of behavior to the database table.


To work around this limitation, this method takes guesses at the serialization of `value` based on whatever
serializers are available in the active `result_accept_content` or `accept_content` Celery config setting.
Each serializer will attempt deserialization, and if one succeeds, we return the deserialized value immediately.
If all deserialization attempts fail, we will gracefully return the original bytes value with a prefix message
explaining that deserialization failed.

TODO: currently this method only attempts deserialization with JSON and pickle. We should support more built-in
content types, and potentially allow for deserialization using custom encodings. We chose to limit the
supported serializers here because JSON and pickle are the only serialization mechanisms available without
additional dependencies (e.g. 'yaml' requires the inclusion of the third-party `yaml` library).
"""
celery_result_accept_content: list[str] = (
self.backend.app.conf.result_accept_content
or self.backend.app.conf.accept_content
)
accept_content_types: list[str] = [item.lower() for item in celery_result_accept_content]

if 'json' in accept_content_types or 'application/json' in accept_content_types:
try:
return kombu.serialization.registry._decoders['application/json'](value)
except (json.JSONDecodeError, UnicodeDecodeError):
pass

if 'pickle' in accept_content_types or 'application/x-python-serialize' in accept_content_types:
try:
return kombu.serialization.registry._decoders['application/x-python-serialize'](value)
except pickle.UnpicklingError:
pass

# couldn't deserialize; just fall back to an error message plus the `repr()` of the original byte string
return 'Failed to deserialize binary value: ' + repr(value)
3 changes: 1 addition & 2 deletions flower/utils/results/stores/redis.py
@@ -1,4 +1,3 @@
import json
from collections.abc import Iterator
from typing import Any

@@ -23,7 +22,7 @@ def results_by_timestamp(self, limit: int | None = None, reverse: bool = True) -
for key in self.backend.client.scan_iter(
match=task_key_prefix + ("*" if isinstance(task_key_prefix, str) else b"*")
):
result_data: dict[str, Any] = json.loads(self.backend.client.get(key))
result_data: dict[str, Any] = self.backend.decode_result(self.backend.client.get(key))
Author

This fixes an earlier issue with our Redis result store. Simply invoking the json.loads() method is incomplete, because it will not use the custom decoding logic known to Celery's deserialization methods. So we should instead use something within Celery-land to perform decoding.

Cool, makes sense!

result = Result(**result_data)
heap.push(result)

2 changes: 2 additions & 0 deletions requirements/dev.txt
@@ -1,4 +1,6 @@
-r default.txt
-r test.txt
redis>=4.3.6
SQLAlchemy>=1.4,<2
psycopg2-binary>=2.9,<3
pylint
70 changes: 68 additions & 2 deletions tests/unit/__init__.py
@@ -2,10 +2,13 @@
import unittest
from distutils.util import strtobool
from unittest.mock import patch
from urllib.parse import urlencode
from urllib.parse import urlencode, urlparse, urlunparse

import celery
import celery.backends.database.session
import sqlalchemy.schema
import tornado.testing
from celery.backends.database import DatabaseBackend
from celery.exceptions import ImproperlyConfigured
from tornado.ioloop import IOLoop
from tornado.options import options

@@ -60,3 +63,66 @@ def setUpClass(cls):
'__unittest_skip_why__',
f'Skipping this test case due to the "{skip_backend_tests_env_var}" being true',
)


class DatabaseBackendDependentTestCase(BackendDependentTestCase):
"""
Extension of `BackendDependentTestCase` that sets a default value for `self.app.conf.database_url` based on the
`TEST_DATABASE_CELERY_RESULT_BACKEND_CONNECTION_STRING` environment variable. If no such environment variable
exists, the setup will assume a localhost connection to Postgres.
"""
Comment on lines +69 to +73
Author

I struggle with how to structure our unit tests here, particularly for test classes that require a live backend data store in order to work properly. The more backends we support, the more dependencies on external processes we will encounter. For example, I could see us needing Postgres, MongoDB, Redis, Memcached, RabbitMQ, and on and on in order to run a comprehensive unit test suite. I know that in other projects, we've used tools like embedded RabbitMQ or embedded Postgres so that our unit test suite can run with complete autonomy and no dependencies on docker containers running those dependencies alongside the test process. But installing those seemed like overkill here, given that upstream Flower currently does not have a solution to that issue.

As long as we're ok with adding a bunch of additional containers to docker-compose.yml, we should be able to run these tests successfully. However, I'm not convinced that the upstream Flower project will want to have all those dependencies. Just something to think about for future work.

As long as we're ok with adding a bunch of additional containers to docker-compose.yml, we should be able to run these tests successfully. However, I'm not convinced that the upstream Flower project will want to have all those dependencies. Just something to think about for future work.

The more sustainable approach is to use docker-py; here's an example of what I mean.

If you go this route it might be better to look into docker on whales; the docker-py library is kind of a shit show, it languished for literal years, like a full 12 months with a very high-severity security vulnerability active before they fixed it.
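For reference, a minimal sketch of the docker-py route (hypothetical fixture; the image and credentials just mirror docker-compose.yml, and none of this is in the PR):

import unittest

import docker


class PostgresContainerTestCase(unittest.TestCase):
    """Spin up a throwaway Postgres container for the duration of the test class."""

    @classmethod
    def setUpClass(cls):
        cls.docker_client = docker.from_env()
        cls.postgres = cls.docker_client.containers.run(
            "postgres:14-alpine",
            environment={
                "POSTGRES_DB": "postgres",
                "POSTGRES_USER": "postgres",
                "POSTGRES_PASSWORD": "postgres",
            },
            ports={"5432/tcp": 5432},
            detach=True,
        )

    @classmethod
    def tearDownClass(cls):
        cls.postgres.stop()
        cls.postgres.remove()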

This is at the heart of why I prefer to say "automated tests", not "unit tests": I want integration tests to be automated, which often means lots of nasty scripting, scripting that falls outside the realm of what unittest.TestCase is good at.

A great place to start is by documenting the manual steps taken to perform an integration test by hand.

For scripting against docker-compose or docker, docker:dind works totally great. I can pair up and give a demo of that any time; it's a bit technical, but it makes integrating against Docker with predictable state possible.


test_schema_name = 'test_flower'
"""
Name of the DB schema within which we should run tests. This should be separate from the main database schema so
we are safe to create/destroy records at-will throughout the testing lifecycle.
"""

def setUp(self):
super().setUp()
if hasattr(self, 'app'):
if not isinstance(self.app, celery.Celery):
raise ImproperlyConfigured(
'If `self.app` is initialized by another class setUp, it must be an instance of Celery'
)
else:
self.app = celery.Celery()

database_url_parsed = urlparse(
os.environ.get(
'TEST_DATABASE_CELERY_RESULT_BACKEND_CONNECTION_STRING',
'postgresql://postgres:postgres@localhost:5432',
)
)
if '+' in database_url_parsed.scheme:
raise ImproperlyConfigured(
'Should exclude the "+" from Celery database_url scheme and instead only supply the database protocol'
)
self.app.conf.database_url = urlunparse(database_url_parsed)

# restrict creation/deletion of DB models to a separate schema
self.app.conf.database_table_schemas = {
'task': self.test_schema_name,
'group': self.test_schema_name,
}

self.backend = DatabaseBackend(app=self.app)
self._ensure_test_schema()

def _ensure_test_schema(self) -> None:
"""
Create a short-lived session that executes a CREATE SCHEMA statement if the test schema does not yet exist
in the database.
"""
test_schema_name = self.test_schema_name

class CreateSchemaSessionManager(celery.backends.database.session.SessionManager):
def prepare_models(self, engine):
with engine.connect() as conn:
if not conn.dialect.has_schema(conn, test_schema_name):
conn.execute(sqlalchemy.schema.CreateSchema(test_schema_name))
return super().prepare_models(engine)

with self.backend.ResultSession(session_manager=CreateSchemaSessionManager()):
# invoking the context manager will invoke the `prepare_models()` that ensures a schema
pass