diff --git a/CHANGES.rst b/CHANGES.rst index 5fb9a803e..878bec749 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,6 +10,7 @@ Upgrade Implications: Full list of changes: +- Added queryset-based filtering with ``as_of`` (gh-397) - Added index on `history_date` column; opt-out with setting `SIMPLE_HISTORY_DATE_INDEX` (gh-565) - Added ``excluded_field_kwargs`` to support custom ``OneToOneField`` that have additional arguments that don't exist on ``ForeignKey``. (gh-870) diff --git a/docs/querying_history.rst b/docs/querying_history.rst index aa1798eff..5e360402a 100644 --- a/docs/querying_history.rst +++ b/docs/querying_history.rst @@ -98,16 +98,44 @@ This will change the ``poll`` instance to have the data from the as_of ----- -This method will return an instance of the model as it would have existed at -the provided date and time. +The HistoryManager allows you to query a point in time for the latest historical +records or instances. When called on an instance's history manager, the ``as_of`` +method will return the instance from the specified point in time, if the instance +existed at that time, or raise DoesNotExist. When called on a model's history +manager, the ``as_of`` method will return instances from a specific date and time +that you specify, returning a queryset that you can use to further filter the result. .. code-block:: pycon - >>> from datetime import datetime - >>> poll.history.as_of(datetime(2010, 10, 25, 18, 4, 0)) - - >>> poll.history.as_of(datetime(2010, 10, 25, 18, 5, 0)) - + >>> t0 = datetime.now() + >>> document1 = RankedDocument.objects.create(rank=42) + >>> document2 = RankedDocument.objects.create(rank=84) + >>> t1 = datetime.now() + + >>> RankedDocument.history.as_of(t1) + , + + ]> + + >>> RankedDocument.history.as_of(t1).filter(rank__lte=50) + + ]> + +``as_of`` is a convenience: the following two queries are identical. + +.. code-block:: pycon + + RankedDocument.history.as_of(t1) + RankedDocument.history.filter(history_date__lte=t1).latest_of_each().as_instances() + +If you filter by `pk` the behavior depends on whether the queryset is +returning instances or historical records. When the queryset is returning +instances, `pk` is mapped to the original model's primary key field. +When the queryset is returning historical records, `pk` refers to the +`history_id` primary key. + most_recent ----------- diff --git a/simple_history/manager.py b/simple_history/manager.py index 5e52414fc..3349d24f9 100644 --- a/simple_history/manager.py +++ b/simple_history/manager.py @@ -1,6 +1,6 @@ from django.conf import settings from django.db import connection, models -from django.db.models import OuterRef, Subquery +from django.db.models import OuterRef, QuerySet, Subquery from django.utils import timezone from simple_history.utils import ( @@ -9,14 +9,113 @@ ) +class HistoricalQuerySet(QuerySet): + """ + Enables additional functionality when working with historical records. + + For additional history on this topic, see: + - https://github.com/jazzband/django-simple-history/pull/229 + - https://github.com/jazzband/django-simple-history/issues/354 + - https://github.com/jazzband/django-simple-history/issues/397 + """ + + def __init__(self, *args, **kwargs): + super(HistoricalQuerySet, self).__init__(*args, **kwargs) + self._as_instances = False + self._pk_attr = self.model.instance_type._meta.pk.attname + + def as_instances(self): + """ + Return a queryset that generates instances instead of historical records. + Queries against the resulting queryset will translate `pk` into the + primary key field of the original type. + + Returns a queryset. + """ + if not self._as_instances: + result = self.exclude(history_type="-") + result._as_instances = True + else: + result = self._clone() + return result + + def filter(self, *args, **kwargs): + """ + If a `pk` filter arrives and the queryset is returning instances + then the caller actually wants to filter based on the original + type's primary key, and not the history_id (historical record's + primary key); this happens frequently with DRF. + """ + if self._as_instances and "pk" in kwargs: + kwargs[self._pk_attr] = kwargs.pop("pk") + return super().filter(*args, **kwargs) + + def latest_of_each(self): + """ + Ensures results in the queryset are the latest historical record for each + primary key. Deletions are not removed. + + Returns a queryset. + """ + # If using MySQL, need to get a list of IDs in memory and then use them for the + # second query. + # Does mean two loops through the DB to get the full set, but still a speed + # improvement. + backend = connection.vendor + if backend == "mysql": + history_ids = {} + for item in self.order_by("-history_date", "-pk"): + if getattr(item, self._pk_attr) not in history_ids: + history_ids[getattr(item, self._pk_attr)] = item.pk + latest_historics = self.filter(history_id__in=history_ids.values()) + elif backend == "postgresql": + latest_pk_attr_historic_ids = ( + self.order_by(self._pk_attr, "-history_date", "-pk") + .distinct(self._pk_attr) + .values_list("pk", flat=True) + ) + latest_historics = self.filter(history_id__in=latest_pk_attr_historic_ids) + else: + latest_pk_attr_historic_ids = ( + self.filter(**{self._pk_attr: OuterRef(self._pk_attr)}) + .order_by("-history_date", "-pk") + .values("pk")[:1] + ) + latest_historics = self.filter( + history_id__in=Subquery(latest_pk_attr_historic_ids) + ) + return latest_historics + + def _clone(self): + c = super()._clone() + c._as_instances = self._as_instances + c._pk_attr = self._pk_attr + return c + + def _fetch_all(self): + super()._fetch_all() + self._instanceize() + + def _instanceize(self): + """ + Convert the result cache to instances if possible and it has not already been + done. If a query extracts `.values(...)` then the result cache will not contain + historical objects to be converted. + """ + if ( + self._result_cache + and self._as_instances + and isinstance(self._result_cache[0], self.model) + ): + self._result_cache = [item.instance for item in self._result_cache] + + class HistoryDescriptor: def __init__(self, model): self.model = model def __get__(self, instance, owner): - if instance is None: - return HistoryManager(self.model) - return HistoryManager(self.model, instance) + return HistoryManager.from_queryset(HistoricalQuerySet)(self.model, instance) class HistoryManager(models.Manager): @@ -66,16 +165,41 @@ def most_recent(self): return self.instance.__class__(**values) def as_of(self, date): - """Get a snapshot as of a specific date. + """ + Get a snapshot as of a specific date. + + When this is used on an instance, it will return the instance based + on the specific date. If the instance did not exist yet, or had been + deleted, then a DoesNotExist error is railed. + + When this is used on a model's history manager, the resulting queryset + will locate the most recent historical record before the specified date + for each primary key, generating instances. If the most recent historical + record is a deletion, that instance is dropped from the result. + + A common usage pattern for querying is to accept an optional time + point `date` and then use: + + `qs = .history.as_of(date) if date else .objects` - Returns an instance, or an iterable of the instances, of the - original model with all the attributes set according to what - was present on the object on the date provided. + after which point one can add filters, values - anything a normal + queryset would support. + + To retrieve historical records, query the model's history directly; + for example: + `qs = .history.filter(history_date__lte=date, pk=...)` + + To retrieve the most recent historical record, including deletions, + you could then use: + `qs = qs.latest_of_each()` """ - if not self.instance: - return self._as_of_set(date) queryset = self.get_queryset().filter(history_date__lte=date) + if not self.instance: + queryset = queryset.latest_of_each().as_instances() + return queryset + try: + # historical records are sorted in reverse chronological order history_obj = queryset[0] except IndexError: raise self.instance.DoesNotExist( @@ -87,43 +211,6 @@ def as_of(self, date): ) return history_obj.instance - def _as_of_set(self, date): - model = type(self.model().instance) # a bit of a hack to get the model - pk_attr = model._meta.pk.name - queryset = self.get_queryset().filter(history_date__lte=date) - # If using MySQL, need to get a list of IDs in memory and then use them for the - # second query. - # Does mean two loops through the DB to get the full set, but still a speed - # improvement. - backend = connection.vendor - if backend == "mysql": - history_ids = {} - for item in queryset.order_by("-history_date", "-pk"): - if getattr(item, pk_attr) not in history_ids: - history_ids[getattr(item, pk_attr)] = item.pk - latest_historics = queryset.filter(history_id__in=history_ids.values()) - elif backend == "postgresql": - latest_pk_attr_historic_ids = ( - queryset.order_by(pk_attr, "-history_date", "-pk") - .distinct(pk_attr) - .values_list("pk", flat=True) - ) - latest_historics = queryset.filter( - history_id__in=latest_pk_attr_historic_ids - ) - else: - latest_pk_attr_historic_ids = ( - queryset.filter(**{pk_attr: OuterRef(pk_attr)}) - .order_by("-history_date", "-pk") - .values("pk")[:1] - ) - latest_historics = queryset.filter( - history_id__in=Subquery(latest_pk_attr_historic_ids) - ) - adjusted = latest_historics.exclude(history_type="-").order_by(pk_attr) - for historic_item in adjusted: - yield historic_item.instance - def bulk_history_create( self, objs, diff --git a/simple_history/tests/models.py b/simple_history/tests/models.py index 993ee7a76..67b877845 100644 --- a/simple_history/tests/models.py +++ b/simple_history/tests/models.py @@ -251,6 +251,7 @@ class Document(models.Model): changed_by = models.ForeignKey( User, on_delete=models.CASCADE, null=True, blank=True ) + history = HistoricalRecords() @property @@ -269,6 +270,12 @@ def _history_user(self, value): self.changed_by = value +class RankedDocument(Document): + rank = models.IntegerField(default=50) + + history = HistoricalRecords() + + class Profile(User): date_of_birth = models.DateField() diff --git a/simple_history/tests/tests/test_manager.py b/simple_history/tests/tests/test_manager.py index c093b537b..f2d1c66ad 100644 --- a/simple_history/tests/tests/test_manager.py +++ b/simple_history/tests/tests/test_manager.py @@ -5,7 +5,7 @@ from django.db import IntegrityError from django.test import TestCase, override_settings, skipUnlessDBFeature -from ..models import Document, Poll +from ..models import Document, Poll, RankedDocument User = get_user_model() @@ -76,13 +76,98 @@ def test_create_and_delete(self): doc_change.history_date = now doc_change.save() docs_as_of_tmw = Document.history.as_of(now + timedelta(days=1)) - self.assertFalse(list(docs_as_of_tmw)) + with self.assertNumQueries(1): + self.assertFalse(list(docs_as_of_tmw)) def test_multiple(self): document1 = Document.objects.create() document2 = Document.objects.create() historical = Document.history.as_of(datetime.now() + timedelta(days=1)) - self.assertEqual(list(historical), [document1, document2]) + # history, even converted to objects, is kept in reverse chronological order + # because sorting it based on the original table's meta ordering is not possible + # when the ordering leverages foreign key relationships + with self.assertNumQueries(1): + self.assertEqual(list(historical), [document2, document1]) + + def test_filter_pk_as_instance(self): + # when a queryset is returning historical documents, `pk` queries + # reference the history_id; however when a queryset is returning + # instances, `pk' queries reference the original table's primary key + document1 = RankedDocument.objects.create(id=101, rank=42) + RankedDocument.objects.create(id=102, rank=84) + self.assertFalse(RankedDocument.history.filter(pk=document1.id)) + self.assertTrue( + RankedDocument.history.all().as_instances().filter(pk=document1.id) + ) + + def test_as_of(self): + """Demonstrates how as_of works now that it returns a QuerySet.""" + t0 = datetime.now() + document1 = RankedDocument.objects.create(rank=42) + document2 = RankedDocument.objects.create(rank=84) + t1 = datetime.now() + document2.rank = 51 + document2.save() + document1.delete() + t2 = datetime.now() + + # nothing exists at t0 + queryset = RankedDocument.history.as_of(t0) + self.assertEqual(queryset.count(), 0) + + # at t1, two records exist + queryset = RankedDocument.history.as_of(t1) + self.assertEqual(queryset.count(), 2) + self.assertEqual(queryset.filter(rank__gte=75).count(), 1) + ids = set([item["id"] for item in queryset.values("id")]) + self.assertEqual(ids, {document1.id, document2.id}) + + # at t2 we have one record left + queryset = RankedDocument.history.as_of(t2) + self.assertEqual(queryset.count(), 1) + self.assertEqual(queryset.filter(rank__gte=75).count(), 0) + + def test_historical_query_set(self): + """ + Demonstrates how the HistoricalQuerySet works to provide as_of functionality. + """ + document1 = RankedDocument.objects.create(rank=42) + document2 = RankedDocument.objects.create(rank=84) + document2.rank = 51 + document2.save() + document1.delete() + t2 = datetime.now() + + # look for historical records, get back a queryset + with self.assertNumQueries(1): + queryset = RankedDocument.history.filter(history_date__lte=t2) + self.assertEqual(queryset.count(), 4) + + # only want the most recend records (provided by HistoricalQuerySet) + self.assertEqual(queryset.latest_of_each().count(), 2) + + # want to see the instances as of that time? + self.assertEqual(queryset.latest_of_each().as_instances().count(), 1) + + # these new methods are idempotent + self.assertEqual( + queryset.latest_of_each() + .latest_of_each() + .as_instances() + .as_instances() + .count(), + 1, + ) + + # that was all the same as calling as_of! + self.assertEqual( + set( + RankedDocument.history.filter(history_date__lte=t2) + .latest_of_each() + .as_instances() + ), + set(RankedDocument.history.as_of(t2)), + ) class BulkHistoryCreateTestCase(TestCase):