-
Notifications
You must be signed in to change notification settings - Fork 50
Django command for generating waveforms #530
Changes from all commits
b46a656
98768e2
aaebea6
ae7a1b0
b2959f4
aab0205
4d25dc6
9c053cc
9ad8e90
f665ea4
71ddcec
c412147
3d3c05e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
import logging | ||
import subprocess | ||
|
||
from catalog.api.models.audio import Audio, AudioAddOn | ||
from django_tqdm import BaseCommand | ||
from limit import limit | ||
|
||
|
||
def paginate_reducing_query(get_query_set, page_size=10): | ||
""" | ||
We can't use `Paginator` because it can't handle the situation | ||
where the query result changes each time a page is accessed. | ||
Because the `audios` QuerySet result is naturally getting smaller | ||
each time we successfully process waveforms, we can just take | ||
the first ten for each "page" until the page comes back empty. | ||
This should theoretically be faster/less DB latency inducing | ||
anyway as we're never going to have huge OFFSET values to | ||
access deep pages. | ||
""" | ||
page = list(get_query_set()[0:page_size]) | ||
while len(page): | ||
yield page | ||
page = list(get_query_set()[0:page_size]) | ||
|
||
|
||
class Command(BaseCommand): | ||
help = "Generates waveforms for all audio records to populate the cache." | ||
""" | ||
Note: We rely on the file download and waveform generation times | ||
taking long enough to prevent us from either making too many requests | ||
to the upstream provider or inserting into our database too quickly and | ||
causing a slow down. In local tests and in tests run on the staging server | ||
it appeared to take on average around 6 to 8 seconds for each audio file. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should depend on the size of the audio file. Many Freesound and Wikimedia files are very small. I sometimes see 2-3 it/s in the console. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right. I think we probably should use the |
||
That should be enough latency to not cause any problems. | ||
""" | ||
|
||
def add_arguments(self, parser): | ||
parser.add_argument( | ||
"--no_rate_limit", help="Remove self impose rate limits for testing." | ||
) | ||
parser.add_argument( | ||
"--max_records", help="Limit the number of waveforms to create.", type=int | ||
) | ||
|
||
def get_audio_handler(self, options): | ||
if options["no_rate_limit"]: | ||
return lambda audio: audio.get_or_create_waveform() | ||
|
||
@limit(limit=1, every=2) # Call once per two seconds maximum | ||
def limited(audio): | ||
audio.get_or_create_waveform() | ||
|
||
return limited | ||
|
||
def _process_wavelengths(self, audios, audio_handler, count_to_process): | ||
errored_identifiers = [] | ||
processed = 0 | ||
with self.tqdm(total=count_to_process) as progress: | ||
paginator = paginate_reducing_query( | ||
get_query_set=lambda: audios.exclude(identifier__in=errored_identifiers) | ||
) | ||
for page in paginator: | ||
for audio in page: | ||
if processed > count_to_process: | ||
return errored_identifiers | ||
try: | ||
processed += 1 | ||
audio_handler(audio) | ||
except subprocess.CalledProcessError as err: | ||
errored_identifiers.append(audio.identifier) | ||
self.error( | ||
f"Unable to process {audio.identifier}: " | ||
f"{err.stderr.decode().strip()}" | ||
) | ||
except KeyboardInterrupt: | ||
errored_identifiers.append(audio.identifier) | ||
return errored_identifiers | ||
except BaseException as err: | ||
errored_identifiers.append(audio.identifier) | ||
self.error(f"Unable to process {audio.identifier}: " f"{err}") | ||
progress.update(1) | ||
|
||
return errored_identifiers | ||
|
||
def handle(self, *args, **options): | ||
# These logs really muck up the tqdm output and don't give us much helpful | ||
# information, so they get silenced | ||
logging.getLogger("catalog.api.utils.waveform").setLevel(logging.WARNING) | ||
|
||
existing_waveform_audio_identifiers_query = AudioAddOn.objects.filter( | ||
waveform_peaks__isnull=False | ||
).values_list("audio_identifier", flat=True) | ||
audios = Audio.objects.exclude( | ||
identifier__in=existing_waveform_audio_identifiers_query | ||
).order_by("id") | ||
Comment on lines
+93
to
+95
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we taking this approach because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By "this approach" what do you mean? The inner select on the add ons table? The tables don't have FK relationship though, you're right, at least not one the DB knows about. Trying to guess what your question was about, I don't think there's a JOIN we could do here, just an inner select (though for clarity for anyone reading this who isn't familiar with Django ORM, the inner select will be executed in the DB as an actual inner-SELECT, not Python-side). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I meant that I could have sworn we could do something like
if these were foreign keys. They're not, so I don't think Django would be able to interpret that relationship appropriately. And sorry for the lack of context, I had that initially but second guessed myself and deleted the entire comment 🤦♀️ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ohh yes no that's not possible unfortunately because without the explicit relationship field Django won't add the reverse field relationships you'd need to make that JOIN. |
||
|
||
max_records = options["max_records"] | ||
count = audios.count() | ||
|
||
count_to_process = count | ||
|
||
if max_records is not None: | ||
count_to_process = max_records if max_records < count else count | ||
|
||
self.info( | ||
self.style.NOTICE(f"Generating waveforms for {count_to_process:,} records") | ||
) | ||
|
||
audio_handler = self.get_audio_handler(options) | ||
|
||
errored_identifiers = self._process_wavelengths( | ||
audios, audio_handler, count_to_process | ||
) | ||
|
||
self.info(self.style.SUCCESS("Finished generating waveforms!")) | ||
|
||
if errored_identifiers: | ||
errored_identifiers_joined = "\n".join( | ||
str(identifier) for identifier in errored_identifiers | ||
) | ||
|
||
self.info( | ||
self.style.WARNING( | ||
f"The following Audio identifiers were unable " | ||
f"to be processed\n\n{errored_identifiers_joined}" | ||
) | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from factory import Faker | ||
from faker.providers import BaseProvider | ||
from faker.utils.distribution import choices_distribution | ||
|
||
|
||
class WaveformProvider(BaseProvider): | ||
_float_space = [x / 100.0 for x in range(101)] * 20 | ||
|
||
@classmethod | ||
def generate_waveform(cls) -> list[float]: | ||
return choices_distribution(cls._float_space, p=None, length=1000) | ||
|
||
def waveform(self) -> list[float]: | ||
return WaveformProvider.generate_waveform() | ||
|
||
|
||
Faker.add_provider(WaveformProvider) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from test.factory.faker import Faker | ||
from test.factory.models.media import IdentifierFactory, MediaFactory | ||
|
||
from catalog.api.models.audio import Audio, AudioAddOn | ||
from factory.django import DjangoModelFactory | ||
|
||
|
||
class AudioFactory(MediaFactory): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Super neat! |
||
class Meta: | ||
model = Audio | ||
|
||
|
||
class AudioAddOnFactory(DjangoModelFactory): | ||
class Meta: | ||
model = AudioAddOn | ||
|
||
audio_identifier = IdentifierFactory(AudioFactory) | ||
|
||
waveform_peaks = Faker("waveform") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a nice addition.