Skip to content

Commit

Permalink
Add queryset-based as_of support.
Browse files Browse the repository at this point in the history
This changes `as_of` to return a QuerySet instead of a list of instances
so that further filtering can be applied to the history.
  • Loading branch information
jeking3 committed Jan 24, 2022
1 parent 9d4108c commit 78b0ab3
Show file tree
Hide file tree
Showing 5 changed files with 265 additions and 57 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Upgrade Implications:

Full list of changes:

- Added queryset-based filtering with ``as_of`` (gh-397)
- Added index on `history_date` column; opt-out with setting `SIMPLE_HISTORY_DATE_INDEX` (gh-565)
- Added ``excluded_field_kwargs`` to support custom ``OneToOneField`` that have
additional arguments that don't exist on ``ForeignKey``. (gh-870)
Expand Down
42 changes: 35 additions & 7 deletions docs/querying_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,16 +98,44 @@ This will change the ``poll`` instance to have the data from the
as_of
-----

This method will return an instance of the model as it would have existed at
the provided date and time.
The HistoryManager allows you to query a point in time for the latest historical
records or instances. When called on an instance's history manager, the ``as_of``
method will return the instance from the specified point in time, if the instance
existed at that time, or raise DoesNotExist. When called on a model's history
manager, the ``as_of`` method will return instances from a specific date and time
that you specify, returning a queryset that you can use to further filter the result.

.. code-block:: pycon
>>> from datetime import datetime
>>> poll.history.as_of(datetime(2010, 10, 25, 18, 4, 0))
<Poll: Poll object as of 2010-10-25 18:03:29.855689>
>>> poll.history.as_of(datetime(2010, 10, 25, 18, 5, 0))
<Poll: Poll object as of 2010-10-25 18:04:13.814128>
>>> t0 = datetime.now()
>>> document1 = RankedDocument.objects.create(rank=42)
>>> document2 = RankedDocument.objects.create(rank=84)
>>> t1 = datetime.now()
>>> RankedDocument.history.as_of(t1)
<HistoricalQuerySet [
<RankedDocument: RankedDocument object (1)>,
<RankedDocument: RankedDocument object (2)>
]>
>>> RankedDocument.history.as_of(t1).filter(rank__lte=50)
<HistoricalQuerySet [
<RankedDocument: RankedDocument object (1)>
]>
``as_of`` is a convenience: the following two queries are identical.

.. code-block:: pycon
RankedDocument.history.as_of(t1)
RankedDocument.history.filter(history_date__lte=t1).latest_of_each().as_instances()
If you filter by ``pk``, the behavior depends on whether the queryset is
returning instances or historical records. When the queryset is returning
instances, ``pk`` is mapped to the original model's primary key field.
When the queryset is returning historical records, ``pk`` refers to the
``history_id`` primary key.


most_recent
-----------
Expand Down
181 changes: 134 additions & 47 deletions simple_history/manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from django.conf import settings
from django.db import connection, models
from django.db.models import OuterRef, Subquery
from django.db.models import OuterRef, QuerySet, Subquery
from django.utils import timezone

from simple_history.utils import (
Expand All @@ -9,14 +9,113 @@
)


class HistoricalQuerySet(QuerySet):
    """
    Enables additional functionality when working with historical records.

    For additional history on this topic, see:
        - https://github.com/jazzband/django-simple-history/pull/229
        - https://github.com/jazzband/django-simple-history/issues/354
        - https://github.com/jazzband/django-simple-history/issues/397
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # When True, evaluating the queryset yields `record.instance` for each
        # historical record instead of the record itself (see `_instanceize`).
        self._as_instances = False
        # Attribute name of the primary key on the model that
        # `self.model.instance_type` refers to.
        self._pk_attr = self.model.instance_type._meta.pk.attname

    def as_instances(self):
        """
        Return a queryset that generates instances instead of historical records.
        Queries against the resulting queryset will translate `pk` into the
        primary key field of the original type.

        Records with `history_type="-"` are excluded the first time the
        queryset is switched to instance mode.

        Returns a queryset.
        """
        if not self._as_instances:
            result = self.exclude(history_type="-")
            result._as_instances = True
        else:
            # Already in instance mode: hand back an independent copy.
            result = self._clone()
        return result

    def filter(self, *args, **kwargs):
        """
        If a `pk` filter arrives and the queryset is returning instances
        then the caller actually wants to filter based on the original
        type's primary key, and not the history_id (historical record's
        primary key); this happens frequently with DRF.
        """
        if self._as_instances and "pk" in kwargs:
            kwargs[self._pk_attr] = kwargs.pop("pk")
        return super().filter(*args, **kwargs)

    def latest_of_each(self):
        """
        Ensures results in the queryset are the latest historical record for each
        primary key.  Deletions are not removed.

        Returns a queryset.
        """
        backend = connection.vendor
        if backend == "mysql":
            # If using MySQL, need to get a list of IDs in memory and then use
            # them for the second query.  Does mean two loops through the DB to
            # get the full set, but still a speed improvement.
            #
            # Only the two key columns are read, via `values_list`.  Iterating
            # the queryset itself would hand back converted instances when
            # `as_instances()` was applied first, and `item.pk` would then be
            # the original object's key rather than the `history_id`.
            history_ids = {}
            key_pairs = self.order_by("-history_date", "-pk").values_list(
                self._pk_attr, "pk"
            )
            for instance_pk, history_pk in key_pairs:
                # Rows arrive newest first, so the first id seen per key wins.
                history_ids.setdefault(instance_pk, history_pk)
            latest_historics = self.filter(history_id__in=history_ids.values())
        elif backend == "postgresql":
            # Field-based distinct() keeps the first row per original key,
            # which is the newest one given the ordering.
            latest_pk_attr_historic_ids = (
                self.order_by(self._pk_attr, "-history_date", "-pk")
                .distinct(self._pk_attr)
                .values_list("pk", flat=True)
            )
            latest_historics = self.filter(history_id__in=latest_pk_attr_historic_ids)
        else:
            # Correlated subquery: for each outer row, the id of the newest
            # record that shares its original key.
            latest_pk_attr_historic_ids = (
                self.filter(**{self._pk_attr: OuterRef(self._pk_attr)})
                .order_by("-history_date", "-pk")
                .values("pk")[:1]
            )
            latest_historics = self.filter(
                history_id__in=Subquery(latest_pk_attr_historic_ids)
            )
        return latest_historics

    def _clone(self):
        """Copy the queryset, carrying over the instance-mode state."""
        c = super()._clone()
        c._as_instances = self._as_instances
        c._pk_attr = self._pk_attr
        return c

    def _fetch_all(self):
        """Populate the result cache, then convert it to instances if requested."""
        super()._fetch_all()
        self._instanceize()

    def _instanceize(self):
        """
        Convert the result cache to instances if possible and it has not already been
        done.  If a query extracts `.values(...)` then the result cache will not contain
        historical objects to be converted.
        """
        if (
            self._result_cache
            and self._as_instances
            # After conversion the first item is no longer a historical
            # record, so this check also stops a second conversion.
            and isinstance(self._result_cache[0], self.model)
        ):
            self._result_cache = [item.instance for item in self._result_cache]

class HistoryDescriptor:
def __init__(self, model):
self.model = model

def __get__(self, instance, owner):
if instance is None:
return HistoryManager(self.model)
return HistoryManager(self.model, instance)
return HistoryManager.from_queryset(HistoricalQuerySet)(self.model, instance)


class HistoryManager(models.Manager):
Expand Down Expand Up @@ -66,16 +165,41 @@ def most_recent(self):
return self.instance.__class__(**values)

def as_of(self, date):
"""Get a snapshot as of a specific date.
"""
Get a snapshot as of a specific date.
When this is used on an instance, it will return the instance based
on the specific date. If the instance did not exist yet, or had been
deleted, then a DoesNotExist error is raised.
When this is used on a model's history manager, the resulting queryset
will locate the most recent historical record before the specified date
for each primary key, generating instances. If the most recent historical
record is a deletion, that instance is dropped from the result.
A common usage pattern for querying is to accept an optional time
point `date` and then use:
`qs = <Model>.history.as_of(date) if date else <Model>.objects`
Returns an instance, or an iterable of the instances, of the
original model with all the attributes set according to what
was present on the object on the date provided.
after which point one can add filters, values - anything a normal
queryset would support.
To retrieve historical records, query the model's history directly;
for example:
`qs = <Model>.history.filter(history_date__lte=date, pk=...)`
To retrieve the most recent historical record, including deletions,
you could then use:
`qs = qs.latest_of_each()`
"""
if not self.instance:
return self._as_of_set(date)
queryset = self.get_queryset().filter(history_date__lte=date)
if not self.instance:
queryset = queryset.latest_of_each().as_instances()
return queryset

try:
# historical records are sorted in reverse chronological order
history_obj = queryset[0]
except IndexError:
raise self.instance.DoesNotExist(
Expand All @@ -87,43 +211,6 @@ def as_of(self, date):
)
return history_obj.instance

def _as_of_set(self, date):
    """
    Lazily yield one instance per primary key as of `date`.

    Historical records dated at or before `date` are reduced to the newest
    record for each primary key; keys whose newest record has
    `history_type="-"` are dropped.  The `instance` of each remaining record
    is yielded, ordered by the primary key field.
    """
    # A bit of a hack to get the model: build an empty historical record and
    # look at the type of the instance it produces.
    tracked_model = type(self.model().instance)
    pk_name = tracked_model._meta.pk.name
    up_to_date = self.get_queryset().filter(history_date__lte=date)
    vendor = connection.vendor

    if vendor == "mysql":
        # If using MySQL, need to get a list of IDs in memory and then use them
        # for the second query.  Does mean two loops through the DB to get the
        # full set, but still a speed improvement.
        newest_by_pk = {}
        for record in up_to_date.order_by("-history_date", "-pk"):
            # Newest first, so only the first id seen per key is kept.
            newest_by_pk.setdefault(getattr(record, pk_name), record.pk)
        latest = up_to_date.filter(history_id__in=newest_by_pk.values())
    elif vendor == "postgresql":
        ranked = up_to_date.order_by(pk_name, "-history_date", "-pk")
        newest_ids = ranked.distinct(pk_name).values_list("pk", flat=True)
        latest = up_to_date.filter(history_id__in=newest_ids)
    else:
        same_object = up_to_date.filter(**{pk_name: OuterRef(pk_name)})
        newest_id = same_object.order_by("-history_date", "-pk").values("pk")[:1]
        latest = up_to_date.filter(history_id__in=Subquery(newest_id))

    surviving = latest.exclude(history_type="-").order_by(pk_name)
    yield from (record.instance for record in surviving)

def bulk_history_create(
self,
objs,
Expand Down
7 changes: 7 additions & 0 deletions simple_history/tests/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ class Document(models.Model):
changed_by = models.ForeignKey(
User, on_delete=models.CASCADE, null=True, blank=True
)

history = HistoricalRecords()

@property
Expand All @@ -269,6 +270,12 @@ def _history_user(self, value):
self.changed_by = value


class RankedDocument(Document):
    """A ``Document`` subclass that adds an integer ``rank`` field."""

    # Integer rank; defaults to 50 when no value is supplied.
    rank = models.IntegerField(default=50)

    # History tracking is declared again on the subclass itself.
    history = HistoricalRecords()


class Profile(User):
date_of_birth = models.DateField()

Expand Down
Loading

0 comments on commit 78b0ab3

Please sign in to comment.