Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle leading and trailing whitespace in CSV column headers #350

Merged
merged 1 commit into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions project/npda/general_functions/csv_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,17 @@


def read_csv(csv_file):
    """
    Read an uploaded CSV into a DataFrame with clean headers and parsed dates.

    Column names have leading and trailing whitespace removed before any
    column is looked up by name, and every column listed in ALL_DATES is
    converted to datetimes using the day-first "%d/%m/%Y" format.

    Params:
        csv_file: a path or file-like object accepted by pandas.read_csv

    Returns:
        The parsed pandas DataFrame.

    Raises:
        KeyError: if a column listed in ALL_DATES is missing once the headers
            have been stripped.
        ValueError: if a date cell does not match "%d/%m/%Y".
    """
    df = pd.read_csv(csv_file)

    # Remove leading and trailing whitespace on column names
    # The template published on the RCPCH website has trailing spaces on 'Observation Date: Thyroid Function '
    df.columns = df.columns.str.strip()

    # Dates are parsed only after the headers are stripped, so a padded date
    # header (e.g. ' Date of Birth') is still found by its canonical name
    for column in ALL_DATES:
        df[column] = pd.to_datetime(df[column], format="%d/%m/%Y")

    return df


async def csv_upload(user, dataframe, csv_file, pdu_pz_code):
"""
Expand Down
55 changes: 54 additions & 1 deletion project/npda/tests/test_csv_upload.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from functools import partial
import dataclasses
import tempfile
from functools import partial
from unittest.mock import AsyncMock, patch

from asgiref.sync import sync_to_async, async_to_sync
Expand Down Expand Up @@ -44,6 +45,12 @@ def mock_remote_calls():
def dummy_sheets_folder(request):
return request.config.rootdir / 'project' / 'npda' / 'dummy_sheets'

@pytest.fixture
def dummy_sheet_csv(dummy_sheets_folder):
    """
    Return the full text of 'dummy_sheet.csv' from the dummy sheets folder.

    The text is returned unparsed so a test can edit it (for example the
    header row) before turning it into a DataFrame.
    """
    file = dummy_sheets_folder / 'dummy_sheet.csv'
    # Decode explicitly as UTF-8 instead of the platform default encoding:
    # the same file is read elsewhere with pandas' default (UTF-8) decoding,
    # so this keeps the fixture's text identical on every platform
    with open(file, 'r', encoding='utf-8') as f:
        return f.read()

@pytest.fixture
def valid_df(dummy_sheets_folder):
    """
    Return 'dummy_sheet.csv' from the dummy sheets folder parsed by read_csv.
    """
    sheet_path = dummy_sheets_folder / 'dummy_sheet.csv'
    return read_csv(sheet_path)
Expand Down Expand Up @@ -99,6 +106,13 @@ def async_get_all(query_set_fn):
async def csv_upload_sync(user, dataframe, csv_file, pdu_pz_code):
return await csv_upload(user, dataframe, csv_file, pdu_pz_code)

def read_csv_from_str(contents):
    """
    Parse CSV text with read_csv by way of a temporary file.

    Params:
        contents: the CSV document as a str

    Returns:
        Whatever read_csv returns for a file holding the encoded text.
    """
    payload = contents.encode()

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(payload)
        # Rewind so read_csv starts from the first byte
        tmp.seek(0)

        # Parse while the file is still open
        return read_csv(tmp)


@pytest.mark.django_db
def test_create_patient(test_user, single_row_valid_df):
Expand Down Expand Up @@ -505,3 +519,42 @@ def test_error_looking_up_index_of_multiple_deprivation(test_user, single_row_va

patient = Patient.objects.first()
assert(patient.index_of_multiple_deprivation_quintile is None)


@pytest.mark.django_db
def test_strip_first_spaces_in_column_name(test_user, dummy_sheet_csv):
    """
    A leading space on the 'NHS Number' header is stripped by read_csv and
    the upload still stores the patient's NHS number.
    """
    padded_csv = dummy_sheet_csv.replace("NHS Number", " NHS Number")
    df = read_csv_from_str(padded_csv)

    first_column = df.columns[0]
    assert first_column == "NHS Number"

    csv_upload_sync(test_user, df, None, ALDER_HEY_PZ_CODE)

    expected_nhs_number = nhs_number.standardise_format(df["NHS Number"][0])
    patient = Patient.objects.first()
    assert patient.nhs_number == expected_nhs_number


@pytest.mark.django_db
def test_strip_last_spaces_in_column_name(test_user, dummy_sheet_csv):
    """
    A trailing space on the 'NHS Number' header is stripped by read_csv and
    the upload still stores the patient's NHS number.
    """
    padded_csv = dummy_sheet_csv.replace("NHS Number", "NHS Number ")
    df = read_csv_from_str(padded_csv)

    first_column = df.columns[0]
    assert first_column == "NHS Number"

    csv_upload_sync(test_user, df, None, ALDER_HEY_PZ_CODE)

    expected_nhs_number = nhs_number.standardise_format(df["NHS Number"][0])
    patient = Patient.objects.first()
    assert patient.nhs_number == expected_nhs_number


# Originally found in https://github.com/rcpch/national-paediatric-diabetes-audit/actions/runs/11627684066/job/32381466250
# so we have a separate unit test for it
@pytest.mark.django_db
def test_spaces_in_date_column_name(test_user, dummy_sheet_csv):
    """
    A leading space on the 'Date of Birth' header does not stop the column
    being parsed as a date and stored on the patient.
    """
    padded_csv = dummy_sheet_csv.replace("Date of Birth", " Date of Birth")
    df = read_csv_from_str(padded_csv)

    csv_upload_sync(test_user, df, None, ALDER_HEY_PZ_CODE)

    expected_date_of_birth = df["Date of Birth"][0].date()
    patient = Patient.objects.first()
    assert patient.date_of_birth == expected_date_of_birth