Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Duplicate history cleanup #483

Merged
merged 9 commits into from
Dec 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Authors
- David Grochowski (`ThePumpingLemma <https://github.com/ThePumpingLemma>`_)
- David Hite
- Eduardo Cuducos
- Filipe Pina (@fopina)
- Florian Eßer
- Frank Sachsenheim
- George Vilches
Expand Down
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Changes
- Add german translations (gh-484)
- Add `extra_context` parameter to history_form_view (gh-467)
- Fixed bug that prevented `next_record` and `prev_record` to work with custom manager names (gh-501)
- Added management command `clean_duplicate_history` to remove duplicate history entries (gh-483)

2.5.1 (2018-10-19)
------------------
Expand Down
22 changes: 22 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,25 @@ And to revert to that ``HistoricalPoll`` instance, we can do:
This will change the ``poll`` instance to have the data from the
``HistoricalPoll`` object and it will create a new row in the
``HistoricalPoll`` table indicating that a new change has been made.

Duplicate history entries
~~~~~~~~~~~~~~~~~~~~~~~~~

For performance reasons, ``django-simple-history`` always creates an ``HistoricalRecord``
when ``Model.save()`` is called regardless of data having actually changed.
If you find yourself with a lot of history duplicates you can schedule the
``clean_duplicate_history`` command

.. code-block:: bash

$ python manage.py clean_duplicate_history --auto

You can use ``--auto`` to clean up duplicates for every model
with ``HistoricalRecords`` or enumerate specific models as args.
There is also ``-m/--minutes`` to specify how many minutes to go
back in history while searching (default checks whole history),
so you can schedule, for instance, an hourly cronjob such as

.. code-block:: bash

$ python manage.py clean_duplicate_history -m 60 --auto
110 changes: 110 additions & 0 deletions simple_history/management/commands/clean_duplicate_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from django.utils import timezone
from django.db import transaction

from . import populate_history
from ... import models, utils
from ...exceptions import NotHistoricalModelError


class Command(populate_history.Command):
args = "<app.model app.model ...>"
help = (
"Scans HistoricalRecords for identical sequencial entries "
"(duplicates) in a model and deletes them."
)

DONE_CLEANING_FOR_MODEL = "Removed {count} historical records for {model}\n"

def add_arguments(self, parser):
parser.add_argument("models", nargs="*", type=str)
parser.add_argument(
"--auto",
action="store_true",
dest="auto",
default=False,
help="Automatically search for models with the HistoricalRecords field "
"type",
)
parser.add_argument(
"-d", "--dry", action="store_true", help="Dry (test) run only, no changes"
)
parser.add_argument(
"-m", "--minutes", type=int, help="Only search the last MINUTES of history"
)

def handle(self, *args, **options):
self.verbosity = options["verbosity"]

to_process = set()
model_strings = options.get("models", []) or args

if model_strings:
for model_pair in self._handle_model_list(*model_strings):
to_process.add(model_pair)

elif options["auto"]:
to_process = self._auto_models()

else:
self.log(self.COMMAND_HINT)

self._process(to_process, date_back=options["minutes"], dry_run=options["dry"])

def _process(self, to_process, date_back=None, dry_run=True):
if date_back:
stop_date = timezone.now() - timezone.timedelta(minutes=date_back)
fopina marked this conversation as resolved.
Show resolved Hide resolved
else:
stop_date = None

for model, history_model in to_process:
m_qs = history_model.objects
if stop_date:
m_qs = m_qs.filter(history_date__gte=stop_date)
found = m_qs.count()
self.log("{0} has {1} historical entries".format(model, found), 2)
if not found:
continue

# it would be great if we could just iterate over the instances that
# have changes (in the given period) but
# `m_qs.values(model._meta.pk.name).distinct()`
# is actually slower than looping all and filtering in the code...
for o in model.objects.all():
self._process_instance(o, model, stop_date=stop_date, dry_run=dry_run)

def _process_instance(self, instance, model, stop_date=None, dry_run=True):
entries_deleted = 0
o_qs = instance.history.all()
if stop_date:
# to compare last history match
extra_one = o_qs.filter(history_date__lte=stop_date).first()
o_qs = o_qs.filter(history_date__gte=stop_date)
else:
extra_one = None
with transaction.atomic():
# ordering is ('-history_date', '-history_id') so this is ok
f1 = o_qs.first()
if not f1:
return

for f2 in o_qs[1:]:
entries_deleted += self._check_and_delete(f1, f2, dry_run)
f1 = f2
if extra_one:
entries_deleted += self._check_and_delete(f1, extra_one, dry_run)

self.log(
self.DONE_CLEANING_FOR_MODEL.format(model=model, count=entries_deleted)
)

def log(self, message, verbosity_level=1):
if self.verbosity >= verbosity_level:
self.stdout.write(message)

def _check_and_delete(self, entry1, entry2, dry_run=True):
delta = entry1.diff_against(entry2)
if not delta.changed_fields:
if not dry_run:
entry1.delete()
return 1
return 0
23 changes: 14 additions & 9 deletions simple_history/management/commands/populate_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,27 @@ def handle(self, *args, **options):
to_process.add(model_pair)

elif options["auto"]:
for model in models.registered_models.values():
try: # avoid issues with mutli-table inheritance
history_model = utils.get_history_model_for_model(model)
except NotHistoricalModelError:
continue
to_process.add((model, history_model))
if not to_process:
if self.verbosity >= 1:
self.stdout.write(self.NO_REGISTERED_MODELS)
to_process = self._auto_models()

else:
if self.verbosity >= 1:
self.stdout.write(self.COMMAND_HINT)

self._process(to_process, batch_size=options["batchsize"])

def _auto_models(self):
to_process = set()
for model in models.registered_models.values():
try: # avoid issues with multi-table inheritance
history_model = utils.get_history_model_for_model(model)
except NotHistoricalModelError:
continue
to_process.add((model, history_model))
if not to_process:
if self.verbosity >= 1:
self.stdout.write(self.NO_REGISTERED_MODELS)
return to_process

def _handle_model_list(self, *args):
failing = False
for natural_key in args:
Expand Down
183 changes: 179 additions & 4 deletions simple_history/tests/tests/test_commands.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from contextlib import contextmanager
from datetime import datetime
from datetime import datetime, timedelta

from django.core import management
from django.test import TestCase
from six.moves import cStringIO as StringIO

from simple_history import models as sh_models
from simple_history.management.commands import populate_history
from ..models import Book, Poll, PollWithExcludeFields, Restaurant
from simple_history.management.commands import populate_history, clean_duplicate_history
from ..models import Book, Poll, PollWithExcludeFields, Restaurant, Place


@contextmanager
Expand Down Expand Up @@ -118,7 +118,7 @@ def test_existing_objects(self):

def test_no_historical(self):
out = StringIO()
with replace_registry():
with replace_registry({"test_place": Place}):
management.call_command(self.command_name, auto=True, stdout=out)
self.assertIn(populate_history.Command.NO_REGISTERED_MODELS, out.getvalue())

Expand Down Expand Up @@ -180,3 +180,178 @@ def test_excluded_fields(self):
)
initial_history_record = PollWithExcludeFields.history.all()[0]
self.assertEqual(initial_history_record.question, poll.question)


class TestCleanDuplicateHistory(TestCase):
command_name = "clean_duplicate_history"
command_error = (management.CommandError, SystemExit)

def test_no_args(self):
out = StringIO()
management.call_command(self.command_name, stdout=out, stderr=StringIO())
self.assertIn(clean_duplicate_history.Command.COMMAND_HINT, out.getvalue())

def test_bad_args(self):
test_data = (
(clean_duplicate_history.Command.MODEL_NOT_HISTORICAL, ("tests.place",)),
(clean_duplicate_history.Command.MODEL_NOT_FOUND, ("invalid.model",)),
(clean_duplicate_history.Command.MODEL_NOT_FOUND, ("bad_key",)),
)
for msg, args in test_data:
out = StringIO()
self.assertRaises(
self.command_error,
management.call_command,
self.command_name,
*args,
stdout=StringIO(),
stderr=out
)
self.assertIn(msg, out.getvalue())

def test_no_historical(self):
out = StringIO()
with replace_registry({"test_place": Place}):
management.call_command(self.command_name, auto=True, stdout=out)
self.assertIn(
clean_duplicate_history.Command.NO_REGISTERED_MODELS, out.getvalue()
)

def test_auto_dry_run(self):
p = Poll.objects.create(
question="Will this be deleted?", pub_date=datetime.now()
)
p.save()

# not related to dry_run test, just for increasing coverage :)
# create instance with single-entry history older than "minutes"
# so it is skipped
p = Poll.objects.create(
question="Will this be deleted?", pub_date=datetime.now()
)
h = p.history.first()
h.history_date -= timedelta(hours=1)
h.save()

self.assertEqual(Poll.history.all().count(), 3)
out = StringIO()
management.call_command(
self.command_name,
auto=True,
minutes=50,
dry=True,
stdout=out,
stderr=StringIO(),
)
self.assertEqual(
out.getvalue(),
"Removed 1 historical records for "
"<class 'simple_history.tests.models.Poll'>\n",
)
self.assertEqual(Poll.history.all().count(), 3)

def test_auto_cleanup(self):
p = Poll.objects.create(
question="Will this be deleted?", pub_date=datetime.now()
)
self.assertEqual(Poll.history.all().count(), 1)
p.save()
self.assertEqual(Poll.history.all().count(), 2)
p.question = "Maybe this one won't...?"
p.save()
self.assertEqual(Poll.history.all().count(), 3)
out = StringIO()
management.call_command(
self.command_name, auto=True, stdout=out, stderr=StringIO()
)
self.assertEqual(
out.getvalue(),
"Removed 1 historical records for "
"<class 'simple_history.tests.models.Poll'>\n",
)
self.assertEqual(Poll.history.all().count(), 2)

def test_auto_cleanup_verbose(self):
p = Poll.objects.create(
question="Will this be deleted?", pub_date=datetime.now()
)
self.assertEqual(Poll.history.all().count(), 1)
p.save()
p.question = "Maybe this one won't...?"
p.save()
self.assertEqual(Poll.history.all().count(), 3)
out = StringIO()
management.call_command(
self.command_name,
"tests.poll",
auto=True,
verbosity=2,
stdout=out,
stderr=StringIO(),
)
self.assertEqual(
out.getvalue(),
"<class 'simple_history.tests.models.Poll'> has 3 historical entries\n"
"Removed 1 historical records for "
"<class 'simple_history.tests.models.Poll'>\n",
)
self.assertEqual(Poll.history.all().count(), 2)

def test_auto_cleanup_dated(self):
the_time_is_now = datetime.now()
p = Poll.objects.create(
question="Will this be deleted?", pub_date=the_time_is_now
)
self.assertEqual(Poll.history.all().count(), 1)
p.save()
p.save()
self.assertEqual(Poll.history.all().count(), 3)
p.question = "Or this one...?"
p.save()
p.save()
self.assertEqual(Poll.history.all().count(), 5)

for h in Poll.history.all()[2:]:
h.history_date -= timedelta(hours=1)
h.save()

management.call_command(
self.command_name,
auto=True,
minutes=50,
stdout=StringIO(),
stderr=StringIO(),
)
self.assertEqual(Poll.history.all().count(), 4)

def test_auto_cleanup_dated_extra_one(self):
the_time_is_now = datetime.now()
p = Poll.objects.create(
question="Will this be deleted?", pub_date=the_time_is_now
)
self.assertEqual(Poll.history.all().count(), 1)
p.save()
p.save()
self.assertEqual(Poll.history.all().count(), 3)
p.question = "Or this one...?"
p.save()
p.save()
p.save()
p.save()
self.assertEqual(Poll.history.all().count(), 7)

for h in Poll.history.all()[2:]:
h.history_date -= timedelta(hours=1)
h.save()

management.call_command(
self.command_name,
auto=True,
minutes=50,
stdout=StringIO(),
stderr=StringIO(),
)
# even though only the last 2 entries match the date range
# the "extra_one" (the record before the oldest match)
# is identical to the oldest match, so oldest match is deleted
self.assertEqual(Poll.history.all().count(), 5)