Merge pull request #57 from no10ds/bugfix/temp_file_names
Bugfix/temp file names
lcardno10 authored Mar 21, 2023
2 parents 373ebd1 + 68d606b commit d119a28
Showing 11 changed files with 143 additions and 63 deletions.
2 changes: 2 additions & 0 deletions api/application/services/data_service.py
@@ -90,6 +90,7 @@ def generate_permanent_filename(self, raw_file_identifier: str) -> str:
     def upload_dataset(
         self,
         subject_id: str,
+        job_id: str,
         domain: str,
         dataset: str,
         version: Optional[int],
@@ -105,6 +106,7 @@ def upload_dataset(
         raw_file_identifier = self.generate_raw_file_identifier()
         upload_job = self.job_service.create_upload_job(
             subject_id,
+            job_id,
             file_path.name,
             raw_file_identifier,
             domain,
3 changes: 2 additions & 1 deletion api/application/services/job_service.py
@@ -57,14 +57,15 @@ def get_job(self, job_id: str) -> Dict:
     def create_upload_job(
         self,
         subject_id: str,
+        job_id: str,
         filename: str,
         raw_file_identifier: str,
         domain: str,
         dataset: str,
         version: int,
     ) -> UploadJob:
         job = UploadJob(
-            subject_id, filename, raw_file_identifier, domain, dataset, version
+            subject_id, job_id, filename, raw_file_identifier, domain, dataset, version
         )
         self.db_adapter.store_upload_job(job)
         return job
11 changes: 7 additions & 4 deletions api/controller/datasets.py
@@ -38,6 +38,8 @@
 from api.domain.metadata_search import metadata_search_query
 from api.domain.mime_type import MimeType
 from api.domain.sql_query import SQLQuery
+from api.domain.Jobs.Job import generate_uuid
+

 CATALOG_DISABLED = strtobool(os.environ.get("CATALOG_DISABLED", "False"))

@@ -363,9 +365,10 @@ def upload_data(
     """
     try:
         subject_id = get_subject_id(request)
-        incoming_file_path = store_file_to_disk(file)
+        job_id = generate_uuid()
+        incoming_file_path = store_file_to_disk(job_id, file)
         raw_filename, version, job_id = data_service.upload_dataset(
-            subject_id, domain, dataset, version, incoming_file_path
+            subject_id, job_id, domain, dataset, version, incoming_file_path
         )
         response.status_code = http_status.HTTP_202_ACCEPTED
         return {
@@ -382,8 +385,8 @@ def upload_data(
         raise UserError(message=error.args[0])


-def store_file_to_disk(file: UploadFile = File(...)) -> Path:
-    file_path = Path(file.filename)
+def store_file_to_disk(id: str, file: UploadFile = File(...)) -> Path:
+    file_path = Path(f"{id}-{file.filename}")
     chunk_size_mb = 50
     mb_1 = 1024 * 1024

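The heart of the fix is in `store_file_to_disk`: the controller now mints the job id before touching the disk and uses it to prefix the temporary filename. A minimal standalone sketch of that pattern — plain bytes and an explicit `base_dir` parameter stand in for FastAPI's `UploadFile` and the real chunked write, neither of which appears here as in the PR:

```python
import tempfile
import uuid
from pathlib import Path


def generate_uuid() -> str:
    return str(uuid.uuid4())


def store_file_to_disk(base_dir: Path, job_id: str, filename: str, data: bytes) -> Path:
    # Prefixing the on-disk name with the job id keeps two uploads that
    # share a filename from clobbering each other in the shared location.
    file_path = base_dir / f"{job_id}-{filename}"
    file_path.write_bytes(data)
    return file_path


with tempfile.TemporaryDirectory() as tmp:
    base = Path(tmp)
    path_a = store_file_to_disk(base, generate_uuid(), "data.csv", b"a,b\n1,2\n")
    path_b = store_file_to_disk(base, generate_uuid(), "data.csv", b"a,b\n3,4\n")
    assert path_a != path_b  # no collision despite identical upload names
```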
12 changes: 9 additions & 3 deletions api/domain/Jobs/Job.py
@@ -1,6 +1,6 @@
 import time
 import uuid
-from typing import Set
+from typing import Optional, Set

 from api.common.config.constants import DEFAULT_JOB_EXPIRY_DAYS
 from api.common.utilities import BaseEnum
@@ -26,12 +26,18 @@ def generate_uuid() -> str:


 class Job:
-    def __init__(self, job_type: JobType, step: JobStep, subject_id: str):
+    def __init__(
+        self,
+        job_type: JobType,
+        step: JobStep,
+        subject_id: str,
+        job_id: Optional[str] = None,
+    ):
         self.step: JobStep = step
         self.job_type: JobType = job_type
         self.subject_id: str = subject_id
         self.status: JobStatus = JobStatus.IN_PROGRESS
-        self.job_id: str = generate_uuid()
+        self.job_id: str = job_id if job_id else generate_uuid()
         self.errors: Set[str] = set()
         self.expiry_time: int = int(
             time.time() + DEFAULT_JOB_EXPIRY_DAYS * 24 * 60 * 60
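The `Optional[str]` default preserves every existing call site while letting the upload flow hand in the uuid it already used for the temp filename. A reduced sketch of just that defaulting behaviour — the real `Job` also tracks type, step, status, errors, and expiry, all omitted here:

```python
import uuid
from typing import Optional


def generate_uuid() -> str:
    return str(uuid.uuid4())


class Job:
    def __init__(self, subject_id: str, job_id: Optional[str] = None):
        self.subject_id = subject_id
        # Use the caller's id when one is supplied so the temp file and the
        # job record share a uuid; otherwise mint a fresh one, which keeps
        # call sites that never pass job_id working unchanged.
        self.job_id: str = job_id if job_id else generate_uuid()


assert Job("subject-123", job_id="abc-123").job_id == "abc-123"
assert Job("subject-123").job_id != "abc-123"  # generated when omitted
```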
3 changes: 2 additions & 1 deletion api/domain/Jobs/UploadJob.py
@@ -17,13 +17,14 @@ class UploadJob(Job):
     def __init__(
         self,
         subject_id: str,
+        job_id: str,
         filename: str,
         raw_file_identifier: str,
         domain: str,
         dataset: str,
         version: int,
     ):
-        super().__init__(JobType.UPLOAD, UploadStep.INITIALISATION, subject_id)
+        super().__init__(JobType.UPLOAD, UploadStep.INITIALISATION, subject_id, job_id)
         self.filename: str = filename
         self.raw_file_identifier: str = raw_file_identifier
         self.domain: str = domain
1 change: 1 addition & 0 deletions changelog.md
@@ -11,6 +11,7 @@ See [v6.0.0] changes

 - All required calls in the API are now paginated by Boto3. This fixes an issue where, when there were more than 50 crawlers in the account, the API would fail to retrieve all datasets because the backend call would paginate onto a next page.
 - Fixes an issue where the delete data file endpoint was deleting the raw data file from S3; it now deletes the processed file instead.
+- Fixes an issue where uploaded files were temporarily stored under only the name they were uploaded with, which caused errors when two identically named files were uploaded within a small window.

 ### Added

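A toy illustration of the race the new bullet describes — not code from this PR: with bare filenames, two uploads in the same window resolve to one temp path; with the per-job uuid prefix they cannot.

```python
import uuid
from pathlib import Path

uploads = ["data.csv", "data.csv"]  # two users, same filename, small window

# Old behaviour: both uploads map to a single temp path, so the second
# write overwrites the first while it may still be mid-validation.
old_paths = {Path(name) for name in uploads}
assert len(old_paths) == 1

# New behaviour: the per-job uuid prefix makes every temp path unique.
new_paths = {Path(f"{uuid.uuid4()}-{name}") for name in uploads}
assert len(new_paths) == 2
```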
48 changes: 32 additions & 16 deletions test/api/adapter/test_dynamodb_adapter.py
@@ -561,15 +561,19 @@ def setup_method(self):
         self.test_service_table_name = "TEST SERVICE TABLE"
         self.dynamo_adapter = DynamoDBAdapter(self.dynamo_data_source)

-    @patch("api.domain.Jobs.Job.uuid")
     @patch("api.domain.Jobs.UploadJob.time")
-    def test_store_async_upload_job(self, mock_time, mock_uuid):
+    def test_store_async_upload_job(self, mock_time):
         mock_time.time.return_value = 1000
-        mock_uuid.uuid4.return_value = "abc-123"

         self.dynamo_adapter.store_upload_job(
             UploadJob(
-                "subject-123", "filename.csv", "111-222-333", "domain1", "dataset2", 4
+                "subject-123",
+                "abc-123",
+                "filename.csv",
+                "111-222-333",
+                "domain1",
+                "dataset2",
+                4,
             )
         )
@@ -771,12 +775,16 @@ def test_raises_error_when_get_jobs_fails(self):
         ):
             self.dynamo_adapter.get_jobs("subject-123")

-    @patch("api.domain.Jobs.Job.uuid")
-    def test_update_job(self, mock_uuid):
-        mock_uuid.uuid4.return_value = "abc-123"
+    def test_update_job(self):

         job = UploadJob(
-            "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4
+            "subject-123",
+            "abc-123",
+            "file1.csv",
+            "111-222-333",
+            "domain1",
+            "dataset2",
+            4,
         )
         job.set_step(UploadStep.VALIDATION)
         job.set_status(JobStatus.FAILED)
@@ -804,12 +812,16 @@ def test_update_job(self, mock_uuid):
             },
         )

-    @patch("api.domain.Jobs.Job.uuid")
-    def test_update_job_without_errors(self, mock_uuid):
-        mock_uuid.uuid4.return_value = "abc-123"
+    def test_update_job_without_errors(self):

         job = UploadJob(
-            "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4
+            "subject-123",
+            "abc-123",
+            "file1.csv",
+            "111-222-333",
+            "domain1",
+            "dataset2",
+            4,
         )
         job.set_step(UploadStep.VALIDATION)
         job.set_status(JobStatus.FAILED)
@@ -836,12 +848,16 @@ def test_update_job_without_errors(self, mock_uuid):
             },
         )

-    @patch("api.domain.Jobs.Job.uuid")
-    def test_update_job_raises_error_when_fails(self, mock_uuid):
-        mock_uuid.uuid4.return_value = "abc-123"
+    def test_update_job_raises_error_when_fails(self):

         job = UploadJob(
-            "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4
+            "subject-123",
+            "abc-123",
+            "file1.csv",
+            "111-222-333",
+            "domain1",
+            "dataset2",
+            4,
         )

         self.service_table.update_item.side_effect = ClientError(
10 changes: 5 additions & 5 deletions test/api/application/services/test_data_service.py
@@ -552,7 +552,7 @@ def test_raises_error_when_schema_does_not_exist(self, mock_get_version):

         with pytest.raises(SchemaNotFoundError):
             self.data_service.upload_dataset(
-                "subject-123", "some", "other", None, Path("data.csv")
+                "subject-123", "234", "some", "other", None, Path("data.csv")
             )

         self.s3_adapter.find_schema.assert_called_once_with("some", "other", 1)
@@ -591,12 +591,12 @@ def test_upload_dataset_triggers_process_upload_and_returns_expected_data(

         # WHEN
         uploaded_raw_file = self.data_service.upload_dataset(
-            "subject-123", "some", "other", None, Path("data.csv")
+            "subject-123", "abc-123", "some", "other", None, Path("data.csv")
         )

         # THEN
         self.job_service.create_upload_job.assert_called_once_with(
-            "subject-123", "data.csv", "123-456-789", "some", "other", 1
+            "subject-123", "abc-123", "data.csv", "123-456-789", "some", "other", 1
         )
         self.data_service.generate_raw_file_identifier.assert_called_once()
         mock_thread.assert_called_once_with(
@@ -825,7 +825,7 @@ def test_upload_dataset_in_chunks_with_invalid_data(

         try:
             self.data_service.upload_dataset(
-                "subject-123", "some", "other", 2, Path("data.csv")
+                "subject-123", "abc-123", "some", "other", 2, Path("data.csv")
             )
         except DatasetValidationError as error:
             assert {
@@ -860,7 +860,7 @@ def test_upload_dataset_in_chunks_with_invalid_data_in_multiple_chunks(

         try:
             self.data_service.upload_dataset(
-                "subject-123", "some", "other", 2, Path("data.csv")
+                "subject-123", "abc-123", "some", "other", 2, Path("data.csv")
             )
         except DatasetValidationError as error:
             assert {
45 changes: 28 additions & 17 deletions test/api/application/services/test_job_service.py
@@ -383,11 +383,10 @@ class TestCreateUploadJob:
     def setup(self):
         self.job_service = JobService()

-    @patch("api.domain.Jobs.Job.uuid")
     @patch.object(DynamoDBAdapter, "store_upload_job")
-    def test_creates_upload_job(self, mock_store_upload_job, mock_uuid):
+    def test_creates_upload_job(self, mock_store_upload_job):
         # GIVEN
-        mock_uuid.uuid4.return_value = "abc-123"
+        job_id = "abc-123"
         subject_id = "subject-123"
         filename = "file1.csv"
         raw_file_identifier = "123-456-789"
@@ -397,11 +396,11 @@ def test_creates_upload_job(self, mock_store_upload_job, mock_uuid):

         # WHEN
         result = self.job_service.create_upload_job(
-            subject_id, filename, raw_file_identifier, domain, dataset, version
+            subject_id, job_id, filename, raw_file_identifier, domain, dataset, version
         )

         # THEN
-        assert result.job_id == "abc-123"
+        assert result.job_id == job_id
         assert result.subject_id == subject_id
         assert result.filename == filename
         assert result.step == UploadStep.INITIALISATION
@@ -438,13 +437,17 @@ class TestUpdateJob:
     def setup(self):
         self.job_service = JobService()

-    @patch("api.domain.Jobs.Job.uuid")
     @patch.object(DynamoDBAdapter, "update_job")
-    def test_updates_job(self, mock_update_job, mock_uuid):
+    def test_updates_job(self, mock_update_job):
         # GIVEN
-        mock_uuid.uuid4.return_value = "abc-123"
         job = UploadJob(
-            "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4
+            "subject-123",
+            "abc-123",
+            "file1.csv",
+            "111-222-333",
+            "domain1",
+            "dataset2",
+            4,
         )

         assert job.step == UploadStep.INITIALISATION
@@ -477,13 +480,17 @@ class TestSucceedsJob:
     def setup(self):
         self.job_service = JobService()

-    @patch("api.domain.Jobs.Job.uuid")
     @patch.object(DynamoDBAdapter, "update_job")
-    def test_updates_job(self, mock_update_job, mock_uuid):
+    def test_updates_job(self, mock_update_job):
         # GIVEN
-        mock_uuid.uuid4.return_value = "abc-123"
         job = UploadJob(
-            "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4
+            "subject-123",
+            "abc-123",
+            "file1.csv",
+            "111-222-333",
+            "domain1",
+            "dataset2",
+            4,
         )

         assert job.status == JobStatus.IN_PROGRESS
@@ -525,13 +532,17 @@ class TestFailsJob:
     def setup(self):
         self.job_service = JobService()

-    @patch("api.domain.Jobs.Job.uuid")
     @patch.object(DynamoDBAdapter, "update_job")
-    def test_updates_job(self, mock_update_job, mock_uuid):
+    def test_updates_job(self, mock_update_job):
         # GIVEN
-        mock_uuid.uuid4.return_value = "abc-123"
         job = UploadJob(
-            "subject-123", "file1.csv", "111-222-333", "domain1", "dataset2", 4
+            "subject-123",
+            "abc-123",
+            "file1.csv",
+            "111-222-333",
+            "domain1",
+            "dataset2",
+            4,
         )

         assert job.status == JobStatus.IN_PROGRESS