diff --git a/api/application/services/data_service.py b/api/application/services/data_service.py
index 61134225..e3061e32 100644
--- a/api/application/services/data_service.py
+++ b/api/application/services/data_service.py
@@ -90,6 +90,7 @@ def generate_permanent_filename(self, raw_file_identifier: str) -> str:
     def upload_dataset(
         self,
         subject_id: str,
+        job_id: str,
         domain: str,
         dataset: str,
         version: Optional[int],
@@ -105,6 +106,7 @@ def upload_dataset(
         raw_file_identifier = self.generate_raw_file_identifier()
         upload_job = self.job_service.create_upload_job(
             subject_id,
+            job_id,
             file_path.name,
             raw_file_identifier,
             domain,
diff --git a/api/application/services/job_service.py b/api/application/services/job_service.py
index 8d2dea24..d76ccd4d 100644
--- a/api/application/services/job_service.py
+++ b/api/application/services/job_service.py
@@ -57,6 +57,7 @@ def get_job(self, job_id: str) -> Dict:
     def create_upload_job(
         self,
         subject_id: str,
+        job_id: str,
         filename: str,
         raw_file_identifier: str,
         domain: str,
@@ -64,7 +65,7 @@ def create_upload_job(
         version: int,
     ) -> UploadJob:
         job = UploadJob(
-            subject_id, filename, raw_file_identifier, domain, dataset, version
+            subject_id, job_id, filename, raw_file_identifier, domain, dataset, version
         )
         self.db_adapter.store_upload_job(job)
         return job
diff --git a/api/controller/datasets.py b/api/controller/datasets.py
index e76f9b1f..41cc10a5 100644
--- a/api/controller/datasets.py
+++ b/api/controller/datasets.py
@@ -38,6 +38,8 @@
 from api.domain.metadata_search import metadata_search_query
 from api.domain.mime_type import MimeType
 from api.domain.sql_query import SQLQuery
+from api.domain.Jobs.Job import generate_uuid
+
 
 CATALOG_DISABLED = strtobool(os.environ.get("CATALOG_DISABLED", "False"))
@@ -363,9 +365,10 @@ def upload_data(
     """
     try:
         subject_id = get_subject_id(request)
-        incoming_file_path = store_file_to_disk(file)
+        job_id = generate_uuid()
+        incoming_file_path = store_file_to_disk(job_id, file)
         raw_filename, version, job_id = data_service.upload_dataset(
-            subject_id, domain, dataset, version, incoming_file_path
+            subject_id, job_id, domain, dataset, version, incoming_file_path
         )
         response.status_code = http_status.HTTP_202_ACCEPTED
         return {
@@ -382,8 +385,8 @@
         raise UserError(message=error.args[0])
 
 
-def store_file_to_disk(file: UploadFile = File(...)) -> Path:
-    file_path = Path(file.filename)
+def store_file_to_disk(id: str, file: UploadFile = File(...)) -> Path:
+    file_path = Path(f"{id}-{file.filename}")
 
     chunk_size_mb = 50
     mb_1 = 1024 * 1024
diff --git a/api/domain/Jobs/Job.py b/api/domain/Jobs/Job.py
index b6f88717..5426f0da 100644
--- a/api/domain/Jobs/Job.py
+++ b/api/domain/Jobs/Job.py
@@ -1,6 +1,6 @@
 import time
 import uuid
-from typing import Set
+from typing import Optional, Set
 
 from api.common.config.constants import DEFAULT_JOB_EXPIRY_DAYS
 from api.common.utilities import BaseEnum
@@ -26,12 +26,18 @@ def generate_uuid() -> str:
 
 class Job:
-    def __init__(self, job_type: JobType, step: JobStep, subject_id: str):
+    def __init__(
+        self,
+        job_type: JobType,
+        step: JobStep,
+        subject_id: str,
+        job_id: Optional[str] = None,
+    ):
         self.step: JobStep = step
         self.job_type: JobType = job_type
         self.subject_id: str = subject_id
         self.status: JobStatus = JobStatus.IN_PROGRESS
-        self.job_id: str = generate_uuid()
+        self.job_id: str = job_id if job_id else generate_uuid()
         self.errors: Set[str] = set()
         self.expiry_time: int = int(
             time.time() + DEFAULT_JOB_EXPIRY_DAYS * 24 * 60 * 60
         )
diff --git a/api/domain/Jobs/UploadJob.py b/api/domain/Jobs/UploadJob.py
index e8e7d37e..f2afb9a6 100644
--- a/api/domain/Jobs/UploadJob.py
+++ b/api/domain/Jobs/UploadJob.py
@@ -17,13 +17,14 @@ class UploadJob(Job):
     def __init__(
         self,
         subject_id: str,
+        job_id: str,
         filename: str,
         raw_file_identifier: str,
         domain: str,
         dataset: str,
         version: int,
     ):
-        super().__init__(JobType.UPLOAD, UploadStep.INITIALISATION, subject_id)
+        super().__init__(JobType.UPLOAD, UploadStep.INITIALISATION, subject_id, job_id)
         self.filename: str = filename
         self.raw_file_identifier: str = raw_file_identifier
         self.domain: str = domain
diff --git a/changelog.md b/changelog.md
index 14a573bc..cc5f6598 100644
--- a/changelog.md
+++ b/changelog.md
@@ -11,6 +11,7 @@ See [v6.0.0] changes
 
 - All required calls in the API are now paginated by Boto3. This fixes some large issues where, when there were more than 50 crawlers in the account the API would fail to retrieve all datasets as the backend call would paginate onto a next page.
 - Fixes an issue where the delete data file endpoint was deleting the raw data file from S3 and now instead deletes the processed file instead.
+- Fixes an issue where uploaded files were temporarily stored under just the name they were uploaded with; this caused errors when two identically named files were uploaded within a small window.
 
 ### Added
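Note: the heart of the fix is the `f"{id}-{file.filename}"` prefix in `store_file_to_disk` above. A minimal, self-contained sketch of the failure mode and the remedy follows; the temp directory and helper name are illustrative assumptions, not the repo's actual code.

```python
# Sketch of the collision this patch fixes. Before the change the temp
# path was Path(file.filename), so two uploads named "data.csv" arriving
# within the same window wrote to the same file on disk. Prefixing the
# name with the per-request job id makes every temp path unique.
import uuid
from pathlib import Path

TMP_DIR = Path("/tmp/uploads")  # illustrative location only

def store_upload(job_id: str, filename: str, content: bytes) -> Path:
    path = TMP_DIR / f"{job_id}-{filename}"  # mirrors f"{id}-{file.filename}"
    path.write_bytes(content)
    return path

TMP_DIR.mkdir(parents=True, exist_ok=True)
first = store_upload(str(uuid.uuid4()), "data.csv", b"first,upload")
second = store_upload(str(uuid.uuid4()), "data.csv", b"second,upload")
assert first != second  # distinct temp files despite identical client filenames
```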
"subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4 + "subject-123", + "abc-123", + "file1.csv", + "111-222-333", + "domain1", + "dataset2", + 4, ) self.service_table.update_item.side_effect = ClientError( diff --git a/test/api/application/services/test_data_service.py b/test/api/application/services/test_data_service.py index 94bcaba2..0c02abe4 100644 --- a/test/api/application/services/test_data_service.py +++ b/test/api/application/services/test_data_service.py @@ -552,7 +552,7 @@ def test_raises_error_when_schema_does_not_exist(self, mock_get_version): with pytest.raises(SchemaNotFoundError): self.data_service.upload_dataset( - "subject-123", "some", "other", None, Path("data.csv") + "subject-123", "234", "some", "other", None, Path("data.csv") ) self.s3_adapter.find_schema.assert_called_once_with("some", "other", 1) @@ -591,12 +591,12 @@ def test_upload_dataset_triggers_process_upload_and_returns_expected_data( # WHEN uploaded_raw_file = self.data_service.upload_dataset( - "subject-123", "some", "other", None, Path("data.csv") + "subject-123", "abc-123", "some", "other", None, Path("data.csv") ) # THEN self.job_service.create_upload_job.assert_called_once_with( - "subject-123", "data.csv", "123-456-789", "some", "other", 1 + "subject-123", "abc-123", "data.csv", "123-456-789", "some", "other", 1 ) self.data_service.generate_raw_file_identifier.assert_called_once() mock_thread.assert_called_once_with( @@ -825,7 +825,7 @@ def test_upload_dataset_in_chunks_with_invalid_data( try: self.data_service.upload_dataset( - "subject-123", "some", "other", 2, Path("data.csv") + "subject-123", "abc-123", "some", "other", 2, Path("data.csv") ) except DatasetValidationError as error: assert { @@ -860,7 +860,7 @@ def test_upload_dataset_in_chunks_with_invalid_data_in_multiple_chunks( try: self.data_service.upload_dataset( - "subject-123", "some", "other", 2, Path("data.csv") + "subject-123", "abc-123", "some", "other", 2, Path("data.csv") ) except DatasetValidationError as error: assert { diff --git a/test/api/application/services/test_job_service.py b/test/api/application/services/test_job_service.py index 160e8822..11464317 100644 --- a/test/api/application/services/test_job_service.py +++ b/test/api/application/services/test_job_service.py @@ -383,11 +383,10 @@ class TestCreateUploadJob: def setup(self): self.job_service = JobService() - @patch("api.domain.Jobs.Job.uuid") @patch.object(DynamoDBAdapter, "store_upload_job") - def test_creates_upload_job(self, mock_store_upload_job, mock_uuid): + def test_creates_upload_job(self, mock_store_upload_job): # GIVEN - mock_uuid.uuid4.return_value = "abc-123" + job_id = "abc-123" subject_id = "subject-123" filename = "file1.csv" raw_file_identifier = "123-456-789" @@ -397,11 +396,11 @@ def test_creates_upload_job(self, mock_store_upload_job, mock_uuid): # WHEN result = self.job_service.create_upload_job( - subject_id, filename, raw_file_identifier, domain, dataset, version + subject_id, job_id, filename, raw_file_identifier, domain, dataset, version ) # THEN - assert result.job_id == "abc-123" + assert result.job_id == job_id assert result.subject_id == subject_id assert result.filename == filename assert result.step == UploadStep.INITIALISATION @@ -438,13 +437,17 @@ class TestUpdateJob: def setup(self): self.job_service = JobService() - @patch("api.domain.Jobs.Job.uuid") @patch.object(DynamoDBAdapter, "update_job") - def test_updates_job(self, mock_update_job, mock_uuid): + def test_updates_job(self, mock_update_job): # GIVEN - 
mock_uuid.uuid4.return_value = "abc-123" job = UploadJob( - "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4 + "subject-123", + "abc-123", + "file1.csv", + "111-222-333", + "domain1", + "dataset2", + 4, ) assert job.step == UploadStep.INITIALISATION @@ -477,13 +480,17 @@ class TestSucceedsJob: def setup(self): self.job_service = JobService() - @patch("api.domain.Jobs.Job.uuid") @patch.object(DynamoDBAdapter, "update_job") - def test_updates_job(self, mock_update_job, mock_uuid): + def test_updates_job(self, mock_update_job): # GIVEN - mock_uuid.uuid4.return_value = "abc-123" job = UploadJob( - "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4 + "subject-123", + "abc-123", + "file1.csv", + "111-222-333", + "domain1", + "dataset2", + 4, ) assert job.status == JobStatus.IN_PROGRESS @@ -525,13 +532,17 @@ class TestFailsJob: def setup(self): self.job_service = JobService() - @patch("api.domain.Jobs.Job.uuid") @patch.object(DynamoDBAdapter, "update_job") - def test_updates_job(self, mock_update_job, mock_uuid): + def test_updates_job(self, mock_update_job): # GIVEN - mock_uuid.uuid4.return_value = "abc-123" job = UploadJob( - "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4 + "subject-123", + "abc-123", + "file1.csv", + "111-222-333", + "domain1", + "dataset2", + 4, ) assert job.status == JobStatus.IN_PROGRESS diff --git a/test/api/controller/test_datasets.py b/test/api/controller/test_datasets.py index 077d369b..759f42d3 100644 --- a/test/api/controller/test_datasets.py +++ b/test/api/controller/test_datasets.py @@ -29,15 +29,22 @@ class TestDataUpload(BaseClientTest): @patch.object(DataService, "upload_dataset") @patch("api.controller.datasets.store_file_to_disk") @patch("api.controller.datasets.get_subject_id") + @patch("api.controller.datasets.generate_uuid") def test_calls_data_upload_service_successfully( - self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset + self, + mock_generate_uuid, + mock_get_subject_id, + mock_store_file_to_disk, + mock_upload_dataset, ): file_content = b"some,content" incoming_file_path = Path("filename.csv") incoming_file_name = "filename.csv" raw_file_identifier = "123-456-789" subject_id = "subject_id" + job_id = "abc-123" + mock_generate_uuid.return_value = job_id mock_get_subject_id.return_value = subject_id mock_store_file_to_disk.return_value = incoming_file_path mock_upload_dataset.return_value = f"{raw_file_identifier}.csv", 5, "abc-123" @@ -48,9 +55,9 @@ def test_calls_data_upload_service_successfully( headers={"Authorization": "Bearer test-token"}, ) - mock_store_file_to_disk.assert_called_once_with(ANY) + mock_store_file_to_disk.assert_called_once_with(job_id, ANY) mock_upload_dataset.assert_called_once_with( - subject_id, "domain", "dataset", None, incoming_file_path + subject_id, job_id, "domain", "dataset", None, incoming_file_path ) assert response.status_code == 202 @@ -67,15 +74,22 @@ def test_calls_data_upload_service_successfully( @patch.object(DataService, "upload_dataset") @patch("api.controller.datasets.store_file_to_disk") @patch("api.controller.datasets.get_subject_id") + @patch("api.controller.datasets.generate_uuid") def test_calls_data_upload_service_with_version_successfully( - self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset + self, + mock_generate_uuid, + mock_get_subject_id, + mock_store_file_to_disk, + mock_upload_dataset, ): + job_id = "abc-123" file_content = b"some,content" incoming_file_path = Path("filename.csv") 
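The test updates above all follow the same pattern: because the job id is now injected instead of being generated inside `Job.__init__`, suites can pass a literal id rather than patching `api.domain.Jobs.Job.uuid`. A sketch of the simplified pattern, assuming only the `UploadJob` signature introduced in this diff:

```python
# Deterministic job ids without uuid patching: pass the id explicitly.
from api.domain.Jobs.UploadJob import UploadJob, UploadStep

def test_upload_job_keeps_supplied_id():
    job = UploadJob(
        "subject-123",
        "abc-123",  # literal id; previously needed @patch("api.domain.Jobs.Job.uuid")
        "file1.csv",
        "111-222-333",
        "domain1",
        "dataset2",
        4,
    )
    assert job.job_id == "abc-123"
    assert job.step == UploadStep.INITIALISATION
```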
diff --git a/test/api/controller/test_datasets.py b/test/api/controller/test_datasets.py
index 077d369b..759f42d3 100644
--- a/test/api/controller/test_datasets.py
+++ b/test/api/controller/test_datasets.py
@@ -29,15 +29,22 @@ class TestDataUpload(BaseClientTest):
     @patch.object(DataService, "upload_dataset")
     @patch("api.controller.datasets.store_file_to_disk")
     @patch("api.controller.datasets.get_subject_id")
+    @patch("api.controller.datasets.generate_uuid")
     def test_calls_data_upload_service_successfully(
-        self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset
+        self,
+        mock_generate_uuid,
+        mock_get_subject_id,
+        mock_store_file_to_disk,
+        mock_upload_dataset,
     ):
         file_content = b"some,content"
         incoming_file_path = Path("filename.csv")
         incoming_file_name = "filename.csv"
         raw_file_identifier = "123-456-789"
         subject_id = "subject_id"
+        job_id = "abc-123"
 
+        mock_generate_uuid.return_value = job_id
         mock_get_subject_id.return_value = subject_id
         mock_store_file_to_disk.return_value = incoming_file_path
         mock_upload_dataset.return_value = f"{raw_file_identifier}.csv", 5, "abc-123"
@@ -48,9 +55,9 @@
             headers={"Authorization": "Bearer test-token"},
         )
 
-        mock_store_file_to_disk.assert_called_once_with(ANY)
+        mock_store_file_to_disk.assert_called_once_with(job_id, ANY)
         mock_upload_dataset.assert_called_once_with(
-            subject_id, "domain", "dataset", None, incoming_file_path
+            subject_id, job_id, "domain", "dataset", None, incoming_file_path
         )
 
         assert response.status_code == 202
@@ -67,15 +74,22 @@ def test_calls_data_upload_service_successfully(
     @patch.object(DataService, "upload_dataset")
     @patch("api.controller.datasets.store_file_to_disk")
     @patch("api.controller.datasets.get_subject_id")
+    @patch("api.controller.datasets.generate_uuid")
     def test_calls_data_upload_service_with_version_successfully(
-        self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset
+        self,
+        mock_generate_uuid,
+        mock_get_subject_id,
+        mock_store_file_to_disk,
+        mock_upload_dataset,
     ):
+        job_id = "abc-123"
         file_content = b"some,content"
         incoming_file_path = Path("filename.csv")
         incoming_file_name = "filename.csv"
         raw_file_identifier = "123-456-789"
         subject_id = "subject_id"
+        mock_generate_uuid.return_value = job_id
         mock_get_subject_id.return_value = subject_id
         mock_store_file_to_disk.return_value = incoming_file_path
         mock_upload_dataset.return_value = f"{raw_file_identifier}.csv", 2, "abc-123"
@@ -86,9 +100,9 @@
             headers={"Authorization": "Bearer test-token"},
         )
 
-        mock_store_file_to_disk.assert_called_once_with(ANY)
+        mock_store_file_to_disk.assert_called_once_with(job_id, ANY)
         mock_upload_dataset.assert_called_once_with(
-            subject_id, "domain", "dataset", 2, incoming_file_path
+            subject_id, job_id, "domain", "dataset", 2, incoming_file_path
         )
 
         assert response.status_code == 202
@@ -120,14 +134,21 @@ def test_calls_data_upload_service_fails_when_domain_uppercase(self):
     @patch.object(DataService, "upload_dataset")
     @patch("api.controller.datasets.store_file_to_disk")
     @patch("api.controller.datasets.get_subject_id")
+    @patch("api.controller.datasets.generate_uuid")
     def test_calls_data_upload_service_fails_when_invalid_dataset_is_uploaded(
-        self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset
+        self,
+        mock_generate_uuid,
+        mock_get_subject_id,
+        mock_store_file_to_disk,
+        mock_upload_dataset,
    ):
+        job_id = "job_id"
         file_content = b"some,content"
         incoming_file_path = Path("filename.csv")
         incoming_file_name = "filename.csv"
         subject_id = "subject_id"
+        mock_generate_uuid.return_value = job_id
         mock_get_subject_id.return_value = subject_id
         mock_store_file_to_disk.return_value = incoming_file_path
         mock_upload_dataset.side_effect = DatasetValidationError(
@@ -141,7 +162,7 @@
         )
 
         mock_upload_dataset.assert_called_once_with(
-            subject_id, "domain", "dataset", None, incoming_file_path
+            subject_id, job_id, "domain", "dataset", None, incoming_file_path
         )
 
         assert response.status_code == 400
@@ -192,14 +213,21 @@ def test_raises_error_when_schema_does_not_exist(
     @patch.object(DataService, "upload_dataset")
     @patch("api.controller.datasets.store_file_to_disk")
     @patch("api.controller.datasets.get_subject_id")
+    @patch("api.controller.datasets.generate_uuid")
     def test_raises_error_when_crawler_is_already_running(
-        self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset
+        self,
+        mock_generate_uuid,
+        mock_get_subject_id,
+        mock_store_file_to_disk,
+        mock_upload_dataset,
     ):
+        job_id = "job_id"
         file_content = b"some,content"
         incoming_file_path = Path("filename.csv")
         incoming_file_name = "filename.csv"
         subject_id = "subject_id"
+        mock_generate_uuid.return_value = job_id
         mock_get_subject_id.return_value = subject_id
         mock_store_file_to_disk.return_value = incoming_file_path
         mock_upload_dataset.side_effect = CrawlerIsNotReadyError("Some message")
@@ -211,7 +239,7 @@
         )
 
         mock_upload_dataset.assert_called_once_with(
-            subject_id, "domain", "dataset", None, incoming_file_path
+            subject_id, job_id, "domain", "dataset", None, incoming_file_path
         )
 
         assert response.status_code == 429
@@ -220,14 +248,21 @@
     @patch.object(DataService, "upload_dataset")
     @patch("api.controller.datasets.store_file_to_disk")
     @patch("api.controller.datasets.get_subject_id")
+    @patch("api.controller.datasets.generate_uuid")
     def test_raises_error_when_fails_to_get_crawler_state(
-        self, mock_get_subject_id, mock_store_file_to_disk, mock_upload_dataset
+        self,
+        mock_generate_uuid,
+        mock_get_subject_id,
+        mock_store_file_to_disk,
+        mock_upload_dataset,
     ):
+        job_id = "job_id"
         file_content = b"some,content"
         incoming_file_path = Path("filename.csv")
         incoming_file_name = "filename.csv"
         subject_id = "subject_id"
+        mock_generate_uuid.return_value = job_id
         mock_get_subject_id.return_value = subject_id
         mock_store_file_to_disk.return_value = incoming_file_path
         mock_upload_dataset.side_effect = AWSServiceError("Some message")
@@ -239,7 +274,7 @@
         )
 
         mock_upload_dataset.assert_called_once_with(
-            subject_id, "domain", "dataset", 3, incoming_file_path
+            subject_id, job_id, "domain", "dataset", 3, incoming_file_path
         )
 
         assert response.status_code == 500
diff --git a/test/api/domain/Jobs/test_upload_job.py b/test/api/domain/Jobs/test_upload_job.py
index 2e963cf7..afc7629c 100644
--- a/test/api/domain/Jobs/test_upload_job.py
+++ b/test/api/domain/Jobs/test_upload_job.py
@@ -4,14 +4,18 @@
 from api.domain.Jobs.UploadJob import UploadJob, UploadStep
 
 
-@patch("api.domain.Jobs.Job.uuid")
 @patch("api.domain.Jobs.UploadJob.time")
-def test_initialise_upload_job(mock_time, mock_uuid):
+def test_initialise_upload_job(mock_time):
     mock_time.time.return_value = 1000
-    mock_uuid.uuid4.return_value = "abc-123"
 
     job = UploadJob(
-        "subject-123", "some-filename.csv", "111-222-333", "domain1", "dataset2", 12
+        "subject-123",
+        "abc-123",
+        "some-filename.csv",
+        "111-222-333",
+        "domain1",
+        "dataset2",
+        12,
     )
 
     assert job.job_id == "abc-123"
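For completeness, the `Optional[str]` default on `Job` keeps the old behaviour for any caller that never passes an id. A small sketch of both paths; the `JobType` import location is an assumption based on how `UploadJob.py` uses it:

```python
# The new parameter is backwards compatible: omit job_id and Job still
# generates one via generate_uuid(), exactly as before this patch.
from api.domain.Jobs.Job import Job, JobType
from api.domain.Jobs.UploadJob import UploadStep

explicit = Job(JobType.UPLOAD, UploadStep.INITIALISATION, "subject-123", "abc-123")
assert explicit.job_id == "abc-123"

generated = Job(JobType.UPLOAD, UploadStep.INITIALISATION, "subject-123")
assert generated.job_id  # falls back to a generated uuid string
```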