Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Django command for generating waveforms #530

Merged
merged 13 commits into from
Mar 8, 2022
3 changes: 3 additions & 0 deletions api/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ sphinx = "*"
sphinx-autobuild = "*"
furo = "*"
myst-parser = "*"
factory-boy = "*"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a nice addition.


[packages]
aws-requests-auth = "*"
Expand Down Expand Up @@ -49,6 +50,8 @@ python-decouple = "*"
django-cron = "*"
gunicorn = "*"
sentry-sdk = "*"
django-tqdm = "*"
limit = "*"

[requires]
python_version = "3.10"
87 changes: 67 additions & 20 deletions api/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

127 changes: 127 additions & 0 deletions api/catalog/management/commands/generatewaveforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import logging
import subprocess

from catalog.api.models.audio import Audio, AudioAddOn
from django_tqdm import BaseCommand
from limit import limit


def paginate_reducing_query(get_query_set, page_size=10):
"""
We can't use `Paginator` because it can't handle the situation
where the query result changes each time a page is accessed.
Because the `audios` QuerySet result is naturally getting smaller
each time we successfully process waveforms, we can just take
the first ten for each "page" until the page comes back empty.
This should theoretically be faster/less DB latency inducing
anyway as we're never going to have huge OFFSET values to
access deep pages.
"""
page = list(get_query_set()[0:page_size])
while len(page):
yield page
page = list(get_query_set()[0:page_size])


class Command(BaseCommand):
help = "Generates waveforms for all audio records to populate the cache."
"""
Note: We rely on the file download and waveform generation times
taking long enough to prevent us from either making too many requests
to the upstream provider or inserting into our database too quickly and
causing a slow down. In local tests and in tests run on the staging server
it appeared to take on average around 6 to 8 seconds for each audio file.
Copy link
Contributor

@obulat obulat Mar 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should depend on the size of the audio file. Many Freesound and Wikimedia files are very small. I sometimes see 2-3 it/s in the console.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I think we probably should use the limit library anyway just in case.

That should be enough latency to not cause any problems.
"""

def add_arguments(self, parser):
parser.add_argument(
"--no_rate_limit", help="Remove self impose rate limits for testing."
)
parser.add_argument(
"--max_records", help="Limit the number of waveforms to create.", type=int
)

def get_audio_handler(self, options):
if options["no_rate_limit"]:
return lambda audio: audio.get_or_create_waveform()

@limit(limit=1, every=2) # Call once per two seconds maximum
def limited(audio):
audio.get_or_create_waveform()

return limited

def _process_wavelengths(self, audios, audio_handler, count_to_process):
errored_identifiers = []
processed = 0
with self.tqdm(total=count_to_process) as progress:
paginator = paginate_reducing_query(
get_query_set=lambda: audios.exclude(identifier__in=errored_identifiers)
)
for page in paginator:
for audio in page:
if processed > count_to_process:
return errored_identifiers
try:
processed += 1
audio_handler(audio)
except subprocess.CalledProcessError as err:
errored_identifiers.append(audio.identifier)
self.error(
f"Unable to process {audio.identifier}: "
f"{err.stderr.decode().strip()}"
)
except KeyboardInterrupt:
errored_identifiers.append(audio.identifier)
return errored_identifiers
except BaseException as err:
errored_identifiers.append(audio.identifier)
self.error(f"Unable to process {audio.identifier}: " f"{err}")
progress.update(1)

return errored_identifiers

def handle(self, *args, **options):
# These logs really muck up the tqdm output and don't give us much helpful
# information, so they get silenced
logging.getLogger("catalog.api.utils.waveform").setLevel(logging.WARNING)

existing_waveform_audio_identifiers_query = AudioAddOn.objects.filter(
waveform_peaks__isnull=False
).values_list("audio_identifier", flat=True)
audios = Audio.objects.exclude(
identifier__in=existing_waveform_audio_identifiers_query
).order_by("id")
Comment on lines +93 to +95
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we taking this approach because Audio and AudioAddOn don't have a foreign key relationship?

Copy link
Contributor

@sarayourfriend sarayourfriend Mar 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "this approach" what do you mean? The inner select on the add ons table?

The tables don't have FK relationship though, you're right, at least not one the DB knows about. Trying to guess what your question was about, I don't think there's a JOIN we could do here, just an inner select (though for clarity for anyone reading this who isn't familiar with Django ORM, the inner select will be executed in the DB as an actual inner-SELECT, not Python-side).

Copy link
Contributor Author

@AetherUnbound AetherUnbound Mar 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I meant that I could have sworn we could do something like

Audio.objects.filter(audioaddon__waveform_peaks__isnull=True)

if these were foreign keys. They're not, so I don't think Django would be able to interpret that relationship appropriately.

And sorry for the lack of context, I had that initially but second guessed myself and deleted the entire comment 🤦‍♀️

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ohh yes no that's not possible unfortunately because without the explicit relationship field Django won't add the reverse field relationships you'd need to make that JOIN.


max_records = options["max_records"]
count = audios.count()

count_to_process = count

if max_records is not None:
count_to_process = max_records if max_records < count else count

self.info(
self.style.NOTICE(f"Generating waveforms for {count_to_process:,} records")
)

audio_handler = self.get_audio_handler(options)

errored_identifiers = self._process_wavelengths(
audios, audio_handler, count_to_process
)

self.info(self.style.SUCCESS("Finished generating waveforms!"))

if errored_identifiers:
errored_identifiers_joined = "\n".join(
str(identifier) for identifier in errored_identifiers
)

self.info(
self.style.WARNING(
f"The following Audio identifiers were unable "
f"to be processed\n\n{errored_identifiers_joined}"
)
)
Empty file added api/test/factory/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions api/test/factory/faker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from factory import Faker
from faker.providers import BaseProvider
from faker.utils.distribution import choices_distribution


class WaveformProvider(BaseProvider):
_float_space = [x / 100.0 for x in range(101)] * 20

@classmethod
def generate_waveform(cls) -> list[float]:
return choices_distribution(cls._float_space, p=None, length=1000)

def waveform(self) -> list[float]:
return WaveformProvider.generate_waveform()


Faker.add_provider(WaveformProvider)
Empty file.
19 changes: 19 additions & 0 deletions api/test/factory/models/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from test.factory.faker import Faker
from test.factory.models.media import IdentifierFactory, MediaFactory

from catalog.api.models.audio import Audio, AudioAddOn
from factory.django import DjangoModelFactory


class AudioFactory(MediaFactory):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super neat!

class Meta:
model = Audio


class AudioAddOnFactory(DjangoModelFactory):
class Meta:
model = AudioAddOn

audio_identifier = IdentifierFactory(AudioFactory)

waveform_peaks = Faker("waveform")
Loading