Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Commit

Permalink
Django command for generating waveforms (#530)
Browse files Browse the repository at this point in the history
* Add django-tqdm dependency

* Add waveform generation command

* Incorporate error handling

* Paginate generatewaveforms

* Lint and test different exception types

* Lint

* Fix bad merge

* Fix test cases

* Correctly handle keyboard interrupt and impose self rate limit

* Move comments to prevent line breaks

* Use clearer argument names

Kudos to @AetherUnbound for coming up with much better
ones than I was able to.

* Add back in destructive logging until we fix this for CI

* Re-lock pipfile

Co-authored-by: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com>
  • Loading branch information
AetherUnbound and sarayourfriend authored Mar 8, 2022
1 parent c96703b commit f74c128
Show file tree
Hide file tree
Showing 13 changed files with 455 additions and 23 deletions.
3 changes: 3 additions & 0 deletions api/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ sphinx = "*"
sphinx-autobuild = "*"
furo = "*"
myst-parser = "*"
factory-boy = "*"

[packages]
aws-requests-auth = "*"
Expand Down Expand Up @@ -49,6 +50,8 @@ python-decouple = "*"
django-cron = "*"
gunicorn = "*"
sentry-sdk = "*"
django-tqdm = "*"
limit = "*"

[requires]
python_version = "3.10"
87 changes: 67 additions & 20 deletions api/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

127 changes: 127 additions & 0 deletions api/catalog/management/commands/generatewaveforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import logging
import subprocess

from catalog.api.models.audio import Audio, AudioAddOn
from django_tqdm import BaseCommand
from limit import limit


def paginate_reducing_query(get_query_set, page_size=10):
    """
    Yield successive "pages" from a query whose result shrinks over time.

    Django's `Paginator` can't be used here because it can't cope with a
    query result that changes every time a page is accessed. The caller is
    expected to shrink the result between pages (for example by processing
    the yielded records so that they no longer match the query), so each
    "page" is simply the first `page_size` results of a fresh query.
    Iteration stops as soon as a page comes back empty.

    This should theoretically also be faster/less DB latency inducing
    because deep pages with huge OFFSET values are never requested.

    :param get_query_set: zero-argument callable returning a sliceable
        result; it is called again for every page
    :param page_size: maximum number of items per yielded page
    :return: generator of non-empty lists
    """
    while True:
        # Always re-run the query: earlier pages are expected to have
        # dropped out of the result by the time the next one is requested.
        batch = list(get_query_set()[:page_size])
        if not batch:
            return
        yield batch


class Command(BaseCommand):
    """
    Generate waveforms for every audio record that does not have one yet.

    Note: We rely on the file download and waveform generation times
    taking long enough to prevent us from either making too many requests
    to the upstream provider or inserting into our database too quickly and
    causing a slow down. In local tests and in tests run on the staging server
    it appeared to take on average around 6 to 8 seconds for each audio file.
    That should be enough latency to not cause any problems. On top of that,
    unless ``--no_rate_limit`` is given, each waveform generation call is
    limited to once every two seconds.
    """

    help = "Generates waveforms for all audio records to populate the cache."

    def add_arguments(self, parser):
        """Register the ``--no_rate_limit`` and ``--max_records`` options."""
        parser.add_argument(
            "--no_rate_limit",
            # Allow the option to be used as a bare flag. Passing an explicit
            # value (the only form accepted previously) keeps working.
            nargs="?",
            const=True,
            help="Remove self impose rate limits for testing.",
        )
        parser.add_argument(
            "--max_records", help="Limit the number of waveforms to create.", type=int
        )

    def get_audio_handler(self, options):
        """
        Return the callable used to generate the waveform of a single audio.

        :param options: parsed command options; only ``no_rate_limit`` is read
        :return: callable accepting one audio record
        """
        if options["no_rate_limit"]:
            return lambda audio: audio.get_or_create_waveform()

        @limit(limit=1, every=2)  # Call once per two seconds maximum
        def limited(audio):
            audio.get_or_create_waveform()

        return limited

    def _process_wavelengths(self, audios, audio_handler, count_to_process):
        """
        Run ``audio_handler`` on at most ``count_to_process`` audio records.

        Records that fail are remembered and excluded from subsequent pages
        so that they are not retried during this run.

        :param audios: queryset of audio records still lacking a waveform
        :param audio_handler: callable invoked once per audio record
        :param count_to_process: upper bound on the number of records handled
        :return: identifiers of the records that could not be processed
        """
        errored_identifiers = []
        processed = 0
        with self.tqdm(total=count_to_process) as progress:
            paginator = paginate_reducing_query(
                get_query_set=lambda: audios.exclude(identifier__in=errored_identifiers)
            )
            for page in paginator:
                for audio in page:
                    # ``>=`` rather than ``>``: ``processed`` counts the records
                    # already handled, so stopping only once it exceeds the
                    # limit would handle one record too many.
                    if processed >= count_to_process:
                        return errored_identifiers
                    processed += 1
                    try:
                        audio_handler(audio)
                    except subprocess.CalledProcessError as err:
                        errored_identifiers.append(audio.identifier)
                        # ``stderr`` is ``None`` when the output was not
                        # captured and ``str`` when the process ran in text
                        # mode; never let the error report itself blow up.
                        stderr = err.stderr
                        if isinstance(stderr, bytes):
                            stderr = stderr.decode(errors="replace")
                        detail = stderr.strip() if stderr else str(err)
                        self.error(f"Unable to process {audio.identifier}: {detail}")
                    except KeyboardInterrupt:
                        # Stop right away, but still report the record that
                        # was being processed when the interrupt arrived.
                        errored_identifiers.append(audio.identifier)
                        return errored_identifiers
                    except Exception as err:
                        # Deliberately broad so one bad record cannot abort the
                        # whole run, but not ``BaseException``: ``SystemExit``
                        # and friends must still propagate.
                        errored_identifiers.append(audio.identifier)
                        self.error(f"Unable to process {audio.identifier}: {err}")
                    progress.update(1)

        return errored_identifiers

    def handle(self, *args, **options):
        """Generate the missing waveforms and report the records that failed."""
        # These logs really muck up the tqdm output and don't give us much helpful
        # information, so they get silenced
        logging.getLogger("catalog.api.utils.waveform").setLevel(logging.WARNING)

        existing_waveform_audio_identifiers_query = AudioAddOn.objects.filter(
            waveform_peaks__isnull=False
        ).values_list("audio_identifier", flat=True)
        audios = Audio.objects.exclude(
            identifier__in=existing_waveform_audio_identifiers_query
        ).order_by("id")

        max_records = options["max_records"]
        count = audios.count()

        count_to_process = count

        if max_records is not None:
            # Never exceed what is available, and treat a negative limit as
            # "process nothing" instead of giving tqdm a negative total.
            count_to_process = max(0, min(max_records, count))

        self.info(
            self.style.NOTICE(f"Generating waveforms for {count_to_process:,} records")
        )

        audio_handler = self.get_audio_handler(options)

        errored_identifiers = self._process_wavelengths(
            audios, audio_handler, count_to_process
        )

        self.info(self.style.SUCCESS("Finished generating waveforms!"))

        if errored_identifiers:
            errored_identifiers_joined = "\n".join(
                str(identifier) for identifier in errored_identifiers
            )

            self.info(
                self.style.WARNING(
                    f"The following Audio identifiers were unable "
                    f"to be processed\n\n{errored_identifiers_joined}"
                )
            )
Empty file added api/test/factory/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions api/test/factory/faker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from factory import Faker
from faker.providers import BaseProvider
from faker.utils.distribution import choices_distribution


class WaveformProvider(BaseProvider):
    """Faker provider that produces fake waveform peak data."""

    # The values 0.00, 0.01, ..., 1.00, with the whole run repeated 20 times.
    _float_space = [step / 100.0 for _ in range(20) for step in range(101)]

    @classmethod
    def generate_waveform(cls) -> list[float]:
        """Return 1000 values picked from ``_float_space`` (no weights given)."""
        peaks = choices_distribution(cls._float_space, p=None, length=1000)
        return peaks

    def waveform(self) -> list[float]:
        """Provider entry point; delegates to :meth:`generate_waveform`."""
        return WaveformProvider.generate_waveform()


# Register the provider on factory's ``Faker`` so that factories can request
# fake waveforms by name, i.e. ``Faker("waveform")``.
Faker.add_provider(WaveformProvider)
Empty file.
19 changes: 19 additions & 0 deletions api/test/factory/models/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from test.factory.faker import Faker
from test.factory.models.media import IdentifierFactory, MediaFactory

from catalog.api.models.audio import Audio, AudioAddOn
from factory.django import DjangoModelFactory


class AudioFactory(MediaFactory):
    """Test factory for ``Audio`` records, built on the shared ``MediaFactory``."""

    class Meta:
        model = Audio


class AudioAddOnFactory(DjangoModelFactory):
    """Test factory for ``AudioAddOn`` records."""

    class Meta:
        model = AudioAddOn

    # NOTE(review): presumably resolves to the identifier of an ``Audio``
    # created through ``AudioFactory`` — confirm against ``IdentifierFactory``.
    audio_identifier = IdentifierFactory(AudioFactory)

    # Filled by the "waveform" Faker provider.
    waveform_peaks = Faker("waveform")
Loading

0 comments on commit f74c128

Please sign in to comment.