This repository has been archived by the owner on Feb 22, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Django command for generating waveforms (#530)
* Add django-tqdm dependency
* Add waveform generation command
* Incorporate error handling
* Paginate generatewaveforms
* Lint and test different exception types
* Lint
* Fix bad merge
* Fix test cases
* Correctly handle keyboard interrupt and impose self rate limit
* Move comments to prevent line breaks
* Use clearer argument names

  Kudos to @AetherUnbound for coming up with much better ones than I was able to.
* Add back in destructive logging until we fix this for CI
* Re-lock pipfile

Co-authored-by: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com>
- Loading branch information
1 parent
c96703b
commit f74c128
Showing
13 changed files
with
455 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
import logging | ||
import subprocess | ||
|
||
from catalog.api.models.audio import Audio, AudioAddOn | ||
from django_tqdm import BaseCommand | ||
from limit import limit | ||
|
||
|
||
def paginate_reducing_query(get_query_set, page_size=10):
    """
    Yield successive "first pages" of a query whose result set shrinks.

    `Paginator` is not usable here because it cannot cope with a query
    whose results change every time a page is accessed. Instead, the
    query is re-evaluated on every iteration and only its first
    ``page_size`` items are taken; iteration stops as soon as that
    slice comes back empty. Because only the head of the query is ever
    requested, no large OFFSET values are needed to reach deep pages.

    The caller is responsible for making the result set shrink between
    pages (e.g. by processing or excluding the yielded items),
    otherwise the same page is yielded forever.

    :param get_query_set: zero-argument callable returning a fresh,
        sliceable query each time it is called
    :param page_size: maximum number of items per yielded page
    :return: generator of non-empty lists of items
    """
    while True:
        # Always re-run the query: the previous page's items are expected
        # to have dropped out of it by now.
        batch = list(get_query_set()[0:page_size])
        if not batch:
            return
        yield batch
|
||
|
||
class Command(BaseCommand):
    """
    Management command that generates waveforms for audio records.

    Only audio records without an ``AudioAddOn`` row holding non-null
    ``waveform_peaks`` are selected for processing.

    Note: We rely on the file download and waveform generation times
    taking long enough to prevent us from either making too many requests
    to the upstream provider or inserting into our database too quickly and
    causing a slow down. In local tests and in tests run on the staging server
    it appeared to take on average around 6 to 8 seconds for each audio file.
    That should be enough latency to not cause any problems.
    """

    help = "Generates waveforms for all audio records to populate the cache."

    def add_arguments(self, parser):
        """Register the ``--no_rate_limit`` and ``--max_records`` options."""
        parser.add_argument(
            "--no_rate_limit",
            # ``nargs="?"`` keeps ``--no_rate_limit <value>`` working while
            # also allowing the option to be passed as a bare flag.
            nargs="?",
            const=True,
            help="Remove self impose rate limits for testing.",
        )
        parser.add_argument(
            "--max_records", help="Limit the number of waveforms to create.", type=int
        )

    def get_audio_handler(self, options):
        """
        Return the callable used to process a single audio record.

        :param options: parsed command options; a truthy ``no_rate_limit``
            selects the unthrottled handler
        :return: callable accepting one audio object and calling its
            ``get_or_create_waveform`` method
        """
        if options["no_rate_limit"]:
            return lambda audio: audio.get_or_create_waveform()

        @limit(limit=1, every=2)  # Call once per two seconds maximum
        def limited(audio):
            audio.get_or_create_waveform()

        return limited

    def _process_wavelengths(self, audios, audio_handler, count_to_process):
        """
        Run ``audio_handler`` over ``audios`` until the limit is reached.

        :param audios: query of audio records still lacking a waveform
        :param audio_handler: callable invoked once per audio record
        :param count_to_process: maximum number of records to attempt
        :return: identifiers of the records that could not be processed
        """
        errored_identifiers = []
        processed = 0
        with self.tqdm(total=count_to_process) as progress:
            # Failed records never leave the base query on their own, so they
            # are excluded explicitly to avoid retrying them on every page.
            paginator = paginate_reducing_query(
                get_query_set=lambda: audios.exclude(identifier__in=errored_identifiers)
            )
            for page in paginator:
                for audio in page:
                    # ``>=`` (not ``>``): stop once exactly
                    # ``count_to_process`` records have been attempted.
                    if processed >= count_to_process:
                        return errored_identifiers
                    try:
                        processed += 1
                        audio_handler(audio)
                    except subprocess.CalledProcessError as err:
                        errored_identifiers.append(audio.identifier)
                        # ``stderr`` is None when it was not captured and is
                        # already text when the process ran in text mode.
                        stderr = err.stderr or b""
                        if isinstance(stderr, bytes):
                            stderr = stderr.decode(errors="replace")
                        self.error(
                            f"Unable to process {audio.identifier}: "
                            f"{stderr.strip()}"
                        )
                    except KeyboardInterrupt:
                        # Stop early but still report what failed so far,
                        # counting the interrupted record as failed.
                        errored_identifiers.append(audio.identifier)
                        return errored_identifiers
                    except BaseException as err:
                        errored_identifiers.append(audio.identifier)
                        self.error(f"Unable to process {audio.identifier}: " f"{err}")
                    progress.update(1)

        return errored_identifiers

    def handle(self, *args, **options):
        """Select the audio records lacking waveforms and process them."""
        # These logs really muck up the tqdm output and don't give us much helpful
        # information, so they get silenced
        logging.getLogger("catalog.api.utils.waveform").setLevel(logging.WARNING)

        existing_waveform_audio_identifiers_query = AudioAddOn.objects.filter(
            waveform_peaks__isnull=False
        ).values_list("audio_identifier", flat=True)
        audios = Audio.objects.exclude(
            identifier__in=existing_waveform_audio_identifiers_query
        ).order_by("id")

        max_records = options["max_records"]
        count = audios.count()

        # Never announce (or attempt) more records than actually exist.
        count_to_process = count if max_records is None else min(max_records, count)

        self.info(
            self.style.NOTICE(f"Generating waveforms for {count_to_process:,} records")
        )

        audio_handler = self.get_audio_handler(options)

        errored_identifiers = self._process_wavelengths(
            audios, audio_handler, count_to_process
        )

        self.info(self.style.SUCCESS("Finished generating waveforms!"))

        if errored_identifiers:
            errored_identifiers_joined = "\n".join(
                str(identifier) for identifier in errored_identifiers
            )

            self.info(
                self.style.WARNING(
                    f"The following Audio identifiers were unable "
                    f"to be processed\n\n{errored_identifiers_joined}"
                )
            )
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from factory import Faker | ||
from faker.providers import BaseProvider | ||
from faker.utils.distribution import choices_distribution | ||
|
||
|
||
class WaveformProvider(BaseProvider):
    """Faker provider that produces random waveform peak lists."""

    # Population the peaks are drawn from: 0.00, 0.01, ..., 1.00,
    # repeated 20 times.
    _float_space = [step / 100.0 for step in range(101)] * 20

    @classmethod
    def generate_waveform(cls) -> list[float]:
        """Return 1000 values drawn from ``_float_space`` with no weighting."""
        peaks = choices_distribution(cls._float_space, p=None, length=1000)
        return peaks

    def waveform(self) -> list[float]:
        """Provider method exposed under the name ``waveform``."""
        return WaveformProvider.generate_waveform()


# Register the provider so ``Faker("waveform")`` declarations can use it.
Faker.add_provider(WaveformProvider)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from test.factory.faker import Faker | ||
from test.factory.models.media import IdentifierFactory, MediaFactory | ||
|
||
from catalog.api.models.audio import Audio, AudioAddOn | ||
from factory.django import DjangoModelFactory | ||
|
||
|
||
class AudioFactory(MediaFactory):
    """Factory for ``Audio`` model instances, built on ``MediaFactory``."""

    class Meta:
        model = Audio
|
||
|
||
class AudioAddOnFactory(DjangoModelFactory):
    """Factory for ``AudioAddOn`` model instances with generated peaks."""

    class Meta:
        model = AudioAddOn

    # Declared through ``IdentifierFactory`` wrapping ``AudioFactory``.
    audio_identifier = IdentifierFactory(AudioFactory)

    # Filled in by the ``waveform`` Faker provider.
    waveform_peaks = Faker("waveform")
Oops, something went wrong.