Skip to content

Commit

Permalink
Improved LabMetadata stats handling
Browse files Browse the repository at this point in the history
* Revised the stats logic so that we no longer need to pd.concat the per-year DataFrames; each sheet is persisted on its own and its stats are reported per year

Fixed #501
  • Loading branch information
victorskl committed Sep 2, 2022
1 parent 44a6c5c commit a25cc94
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 19 deletions.
14 changes: 7 additions & 7 deletions data_processors/lims/lambdas/labmetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
import logging
from datetime import datetime

import pandas as pd

from data_processors.lims.services import labmetadata_srv
from libumccr import libjson

Expand Down Expand Up @@ -71,11 +69,13 @@ def scheduled_update_handler(event, context):
logger.warning(f"LabMetadata table is not truncated. Continue with create or update merging strategy.")
# Note we can decide to error out and halt here instead

frames = []
resp_d = {}
for year in years:
logger.info(f"Downloading {year} sheet")
frames.append(labmetadata_srv.download_metadata(year))

df = pd.concat(frames)
df = labmetadata_srv.download_metadata(year)
stats_d = labmetadata_srv.persist_labmetadata(df)
resp_d.update({
year: stats_d
})

return labmetadata_srv.persist_labmetadata(df)
return resp_d
29 changes: 17 additions & 12 deletions data_processors/lims/lambdas/tests/test_labmetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,20 @@ def test_scheduled_update_handler(self):

when(libgdrive).download_sheet(...).thenReturn(pd.read_csv(mock_labmetadata_sheet))

mock_sheet_year = "2021"

result = labmetadata.scheduled_update_handler({
'sheets': ["2021"],
'sheets': [mock_sheet_year],
'truncate': False,
}, None)

logger.info("-" * 32)
logger.info("Example labmetadata.scheduled_update_handler lambda output:")
logger.info(json.dumps(result))

self.assertEqual(result['labmetadata_row_new_count'], 3)
self.assertEqual(result['labmetadata_row_update_count'], 1)
self.assertEqual(result['labmetadata_row_invalid_count'], 0)
self.assertEqual(result[mock_sheet_year]['labmetadata_row_new_count'], 3)
self.assertEqual(result[mock_sheet_year]['labmetadata_row_update_count'], 1)
self.assertEqual(result[mock_sheet_year]['labmetadata_row_invalid_count'], 0)

lib_blank_ext_sample_id = LabMetadata.objects.get(library_id="LIB01")
self.assertEqual(lib_blank_ext_sample_id.external_sample_id, "")
Expand Down Expand Up @@ -160,15 +162,18 @@ def test_labmetadata_truncate(self) -> None:

when(libgdrive).download_sheet(...).thenReturn(pd.read_csv(mock_labmetadata_sheet))

result = labmetadata.scheduled_update_handler({'sheets': ["2020"]}, None) # set only to 1 sheet
mock_sheet_year = "2020"

result = labmetadata.scheduled_update_handler({'sheets': [mock_sheet_year]}, None) # set only to 1 sheet

logger.info("-" * 32)
logger.info("Example labmetadata.scheduled_update_handler lambda output:")
logger.info(json.dumps(result))

self.assertEqual(result['labmetadata_row_new_count'], 4)
self.assertEqual(result['labmetadata_row_update_count'], 0) # no update, everything should be re-created!!
self.assertEqual(result['labmetadata_row_invalid_count'], 0)
self.assertEqual(result[mock_sheet_year]['labmetadata_row_new_count'], 4)
# no update, everything should be re-created!!
self.assertEqual(result[mock_sheet_year]['labmetadata_row_update_count'], 0)
self.assertEqual(result[mock_sheet_year]['labmetadata_row_invalid_count'], 0)

self.assertEqual(4, LabMetadata.objects.count())

Expand Down Expand Up @@ -291,7 +296,7 @@ def test_scheduled_update_handler(self):
logger.info("-" * 32)
logger.info("Example LabMetadataIntegrationTests.scheduled_update_handler lambda output:")
logger.info(json.dumps(result))
self.assertGreater(result['labmetadata_row_new_count'], 1)
self.assertGreater(result['2022']['labmetadata_row_new_count'], 1)

logger.info(f"Total ingested rows into test db: {LabMetadata.objects.count()}")

Expand All @@ -315,9 +320,9 @@ def test_labmetadata_cell_strip(self) -> None:
logger.info("Example labmetadata.scheduled_update_handler lambda output:")
logger.info(json.dumps(result))

self.assertEqual(result['labmetadata_row_new_count'], 7)
self.assertEqual(result['labmetadata_row_update_count'], 0)
self.assertEqual(result['labmetadata_row_invalid_count'], 0)
self.assertEqual(result[year]['labmetadata_row_new_count'], 7)
self.assertEqual(result[year]['labmetadata_row_update_count'], 0)
self.assertEqual(result[year]['labmetadata_row_invalid_count'], 0)
self.assertEqual(7, LabMetadata.objects.count())

lib_79 = LabMetadata.objects.get(library_id='L2200079')
Expand Down

0 comments on commit a25cc94

Please sign in to comment.