diff --git a/api/Pipfile b/api/Pipfile index 06905e83a..b6e12223b 100644 --- a/api/Pipfile +++ b/api/Pipfile @@ -14,6 +14,7 @@ sphinx = "*" sphinx-autobuild = "*" furo = "*" myst-parser = "*" +factory-boy = "*" [packages] aws-requests-auth = "*" @@ -49,6 +50,8 @@ python-decouple = "*" django-cron = "*" gunicorn = "*" sentry-sdk = "*" +django-tqdm = "*" +limit = "*" [requires] python_version = "3.10" diff --git a/api/Pipfile.lock b/api/Pipfile.lock index b28299972..e67159134 100644 --- a/api/Pipfile.lock +++ b/api/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "45c1c4d7e09cf6bc77ba5f0120bfa8186c133d169c7b231801af4107fb90df8e" + "sha256": "0a42dcc7134e8d9f6064d5508f86ffbfbabbc1b7c77fbd1505f0f976c6bad0e9" }, "pipfile-spec": 6, "requires": { @@ -34,19 +34,19 @@ }, "boto3": { "hashes": [ - "sha256:25a76b7b530a124d9e526c62ff2b7da5782315195badb9a6273714898d689820", - "sha256:cc40566dec3f48611a82ace07b29489848e9bd35a51e3e992d1902a3c037e9fc" + "sha256:20d29e7b845eed3a39b43633443859bfa719ddbe3c8702ddfd999bc737fba9db", + "sha256:d674fcd10c9603984599157941db66aa212d217dd073581c0ba2eab5e5da84fa" ], "index": "pypi", - "version": "==1.21.0" + "version": "==1.21.14" }, "botocore": { "hashes": [ - "sha256:46e51f56f1c5784e4245e036503635fa71b722775657b6e1acf21ec5b906974c", - "sha256:7b166096f9413b41caf7cc6f4edfd5b3c3ab9d7c61eb120a649e69485c98131a" + "sha256:2c40f4fc3925b9057869beda1413a7b77edb7a28eb05c6265eaf6ca1ca7d3b63", + "sha256:9fe55f6eab0977b00afd90e770ef2c8b989fa7b18e5c14b22ba62216ec3b564a" ], "markers": "python_version >= '3.6'", - "version": "==1.24.13" + "version": "==1.24.14" }, "certifi": { "hashes": [ @@ -250,6 +250,14 @@ "index": "pypi", "version": "==1.12.3" }, + "django-tqdm": { + "hashes": [ + "sha256:39db30b64f1177e666cfb77e5769e572bbfe6de6bebcd382beaf8a7b4f335401", + "sha256:c7fa0aadbe512fb7bff83f1d800094313b044eed9ac0552e1c22da49d84b2121" + ], + "index": "pypi", + "version": "==1.0.0" + }, "django-uuslug": { "hashes": [ 
"sha256:047e713eeddecf11a674d4cd27ac72407f85ef13196856ba8dfeb4d691d521d4", @@ -483,6 +491,13 @@ ], "version": "==1.0" }, + "limit": { + "hashes": [ + "sha256:5dcb9d657a17fd4285cda417fb67dcef297bc6179e4c0d25f0a1eaab87ed30ba" + ], + "index": "pypi", + "version": "==0.2.3" + }, "markupsafe": { "hashes": [ "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", @@ -767,11 +782,11 @@ }, "sentry-sdk": { "hashes": [ - "sha256:1ab34e3851a34aeb3d1af1a0f77cec73978c4e9698e5210d050e4932953cb241", - "sha256:ac2a50128409d57655279817aedcb7800cace1f76b266f3dd62055d5afd6e098" + "sha256:411a8495bd18cf13038e5749e4710beb4efa53da6351f67b4c2f307c2d9b6d49", + "sha256:aa52da941c56b5a76fd838f8e9e92a850bf893a9eb1e33ffce6c21431d07ee30" ], "index": "pypi", - "version": "==1.5.6" + "version": "==1.5.7" }, "setuptools": { "hashes": [ @@ -804,6 +819,14 @@ ], "version": "==1.3" }, + "tqdm": { + "hashes": [ + "sha256:1d9835ede8e394bb8c9dcbffbca02d717217113adc679236873eeaac5bc0b3cd", + "sha256:e643e071046f17139dea55b880dc9b33822ce21613b4a4f5ea57f202833dbc29" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.63.0" + }, "uritemplate": { "hashes": [ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", @@ -1148,6 +1171,22 @@ ], "version": "==0.8.3" }, + "factory-boy": { + "hashes": [ + "sha256:a98d277b0c047c75eb6e4ab8508a7f81fb03d2cb21986f627913546ef7a2a55e", + "sha256:eb02a7dd1b577ef606b75a253b9818e6f9eaf996d94449c9d5ebb124f90dc795" + ], + "index": "pypi", + "version": "==3.2.1" + }, + "faker": { + "hashes": [ + "sha256:c88c8b5ee9376a242deca8fe829f9a3215ffa43c31da6f66d9594531fb344453", + "sha256:fa060e331ffffb57cfa4c07f95d54911e339984ed72596ba6a9e7b6fa569d799" + ], + "markers": "python_version >= '3.6'", + "version": "==13.3.1" + }, "filelock": { "hashes": [ "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85", @@ -1179,11 +1218,11 @@ }, "furo": { "hashes": [ - 
"sha256:005823b67f6ba00dde03e68dc78d437c72df7bcf816bcc6c111019c8faa7aa3d", - "sha256:bfed4e6a0511ab7baf211b3195b851914bf11a72e5907e073b1de9296de391f6" + "sha256:6c718293ebf87755f0b9f148b1e697c9e3aabd7af955644d4bcaee5ce75db781", + "sha256:7660267cc67b2828fd0e17bc07adeb612c47b2eba5a6de07049a1569e6044aa8" ], "index": "pypi", - "version": "==2022.2.23" + "version": "==2022.3.4" }, "gevent": { "hashes": [ @@ -1365,11 +1404,11 @@ }, "ipython": { "hashes": [ - "sha256:42c23e90b2deaae631266885de1656a517a1673d7e1db57e8eb3a4bb6cd5ce1b", - "sha256:7bfeb6f298b2d7f3859c4f3e134082015cf34de90f89f5020e107a5a762ef6db" + "sha256:6f56bfaeaa3247aa3b9cd3b8cbab3a9c0abf7428392f97b21902d12b2f42a381", + "sha256:8138762243c9b3a3ffcf70b37151a2a35c23d3a29f9743878c33624f4207be3d" ], "index": "pypi", - "version": "==8.1.0" + "version": "==8.1.1" }, "itsdangerous": { "hashes": [ @@ -1704,6 +1743,14 @@ "index": "pypi", "version": "==4.5.2" }, + "python-dateutil": { + "hashes": [ + "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", + "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.8.2" + }, "pytz": { "hashes": [ "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", @@ -1987,7 +2034,7 @@ "sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68", "sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5" ], - "markers": "python_version >= '3.5'", + "markers": "python_version > '2.7'", "version": "==6.1" }, "traitlets": { @@ -2016,11 +2063,11 @@ }, "virtualenv": { "hashes": [ - "sha256:01f5f80744d24a3743ce61858123488e91cb2dd1d3bdf92adaf1bba39ffdedf0", - "sha256:e7b34c9474e6476ee208c43a4d9ac1510b041c68347eabfe9a9ea0c86aa0a46b" + "sha256:dd448d1ded9f14d1a4bfa6bfc0c5b96ae3be3f2d6c6c159b23ddcfd701baa021", + "sha256:e9dd1a1359d70137559034c0f5433b34caf504af2dc756367be86a5a32967134" ], "markers": 
def paginate_reducing_query(get_query_set, page_size=10):
    """
    Yield "first pages" of a query until the query comes back empty.

    ``Paginator`` is not usable when the result set changes between page
    accesses. Here the caller is expected to shrink the result set while
    consuming each page (for example by processing the records so they no
    longer match), so every iteration simply re-runs the query and takes the
    leading ``page_size`` items again. No large OFFSET is ever needed.

    :param get_query_set: zero-argument callable returning a sliceable
        collection; it is called again for every page.
    :param page_size: maximum number of items per yielded page.
    :return: generator of non-empty lists with at most ``page_size`` items.
    """
    while True:
        batch = list(get_query_set()[0:page_size])
        if not batch:
            # An empty leading page means nothing is left to process.
            return
        yield batch
def _process_wavelengths(self, audios, audio_handler, count_to_process):
    """
    Run ``audio_handler`` on at most ``count_to_process`` audio records.

    Records are fetched page by page through ``paginate_reducing_query``.
    Identifiers that failed are excluded from every following page query so
    a failing record is not fetched again.

    :param audios: queryset-like object of audio records to process; must
        support ``exclude(identifier__in=...)`` and slicing.
    :param audio_handler: callable invoked with each audio record.
    :param count_to_process: upper bound on the number of records handled;
        also used as the progress bar total.
    :return: list of identifiers of the records that could not be processed.
    """
    errored_identifiers = []
    processed = 0
    with self.tqdm(total=count_to_process) as progress:
        paginator = paginate_reducing_query(
            get_query_set=lambda: audios.exclude(identifier__in=errored_identifiers)
        )
        for page in paginator:
            for audio in page:
                # ``>=`` rather than ``>``: the check runs *before* the
                # record is handled, so ``>`` let one extra record through
                # whenever a limit smaller than the page contents was set.
                if processed >= count_to_process:
                    return errored_identifiers
                processed += 1
                try:
                    audio_handler(audio)
                except subprocess.CalledProcessError as err:
                    errored_identifiers.append(audio.identifier)
                    # ``stderr`` is None when it was not captured and may be
                    # text or undecodable bytes; never let the error report
                    # itself raise and abort the whole run.
                    stderr = err.stderr
                    if isinstance(stderr, bytes):
                        stderr = stderr.decode(errors="replace")
                    detail = (stderr or str(err)).strip()
                    self.error(f"Unable to process {audio.identifier}: {detail}")
                except KeyboardInterrupt:
                    # Stop immediately but still report the interrupted
                    # record as unprocessed.
                    errored_identifiers.append(audio.identifier)
                    return errored_identifiers
                except Exception as err:
                    # ``Exception`` instead of ``BaseException`` so that
                    # ``SystemExit``/``GeneratorExit`` are not swallowed and
                    # misreported as a per-record failure.
                    errored_identifiers.append(audio.identifier)
                    self.error(f"Unable to process {audio.identifier}: {err}")
                progress.update(1)

    return errored_identifiers
class WaveformProvider(BaseProvider):
    """Faker provider that produces fake waveform peak lists for tests."""

    # Candidate peak values 0.00, 0.01, ..., 1.00, repeated 20 times.
    _float_space = [hundredth / 100.0 for hundredth in range(101)] * 20

    @classmethod
    def generate_waveform(cls) -> list[float]:
        """
        Return 1000 values picked from ``_float_space``.

        Exposed as a classmethod so it can be called without a Faker
        instance; no explicit weights are supplied (``p=None``).
        """
        peaks = choices_distribution(cls._float_space, p=None, length=1000)
        return peaks

    def waveform(self) -> list[float]:
        """Provider entry point; delegates to ``generate_waveform``."""
        return WaveformProvider.generate_waveform()
class MediaFactory(DjangoModelFactory):
    """Abstract base factory for models extending the AbstractMedia class."""

    class Meta:
        abstract = True

    # A fresh random UUID per built object; the sequence counter is ignored.
    identifier = factory.sequence(lambda _counter: uuid4())

    # The foreign identifier isn't necessarily a UUID, but for test purposes
    # it's fine if it looks like one.
    foreign_identifier = factory.sequence(lambda _counter: uuid4())

    # Picks one of the first elements of the entries in ``LICENSES``.
    license = Faker(
        "random_element",
        elements=[license_entry[0] for license_entry in LICENSES],
    )

    foreign_landing_url = Faker("url")
def assert_all_audio_have_waveforms():
    """
    Assert that every ``Audio`` row has an add-on with non-null waveform peaks.

    The previous implementation compared ``list(...).sort()`` on both sides;
    ``list.sort()`` sorts in place and returns ``None``, so the assertion was
    ``None == None`` and could never fail.
    """
    identifiers_with_waveforms = set(
        AudioAddOn.objects.filter(waveform_peaks__isnull=False).values_list(
            "audio_identifier", flat=True
        )
    )
    audio_identifiers = set(Audio.objects.values_list("identifier", flat=True))

    # Subset check rather than equality: an add-on row may exist for an audio
    # record that has since been deleted (presumably there is no cascade from
    # Audio to AudioAddOn -- confirm against the models).
    missing = audio_identifiers - identifiers_with_waveforms
    assert not missing, f"Audio without waveforms: {sorted(map(str, missing))}"
@pytest.mark.django_db
def test_does_not_reprocess_existing_waveforms():
    """Only audio lacking waveform peaks is counted as work to be done."""
    # Three audio records with no add-on at all.
    pending_audio = AudioFactory.create_batch(3)

    # AudioAddOnFactory creates the associated Audio objects too, so these
    # three are the audio that should _not_ get processed.
    AudioAddOnFactory.create_batch(3)

    # An add-on whose waveform is null: its audio should get processed as well.
    addon_without_peaks = AudioAddOnFactory.create(waveform_peaks=None)
    audio_for_null_addon = Audio.objects.get(
        identifier=addon_without_peaks.audio_identifier
    )
    pending_audio.append(audio_for_null_addon)

    out, _err = call_generatewaveforms()

    expected_count = len(pending_audio)
    assert f"Generating waveforms for {expected_count} records" in out
    assert_all_audio_have_waveforms()
@pytest.mark.django_db
@mock.patch("catalog.api.models.audio.generate_peaks")
def test_keyboard_interrupt_should_halt_processing(mock_generate_peaks):
    """A KeyboardInterrupt during peak generation stops the command early."""
    audio_count = 23
    interrupt_at = 9

    # One side effect per audio record; the one at ``interrupt_at`` raises.
    side_effects = []
    for index in range(audio_count):
        if index == interrupt_at:
            side_effects.append(KeyboardInterrupt())
        else:
            side_effects.append(WaveformProvider.generate_waveform())
    mock_generate_peaks.side_effect = side_effects

    AudioFactory.create_batch(audio_count)

    stdout, stderr = StringIO(), StringIO()
    call_command(
        "generatewaveforms", no_rate_limit=True, stdout=stdout, stderr=stderr
    )

    identifiers_with_waveforms = AudioAddOn.objects.filter(
        waveform_peaks__isnull=False
    ).values_list("audio_identifier", flat=True)
    unprocessed_audio = Audio.objects.exclude(
        identifier__in=identifiers_with_waveforms
    )

    # Everything from the interrupted record onwards is left without a waveform.
    assert unprocessed_audio.count() == audio_count - interrupt_at

    waveform_count = AudioAddOn.objects.filter(waveform_peaks__isnull=False).count()
    assert waveform_count == interrupt_at