Skip to content

Commit

Permalink
formatting
Browse files Browse the repository at this point in the history
Signed-off-by: anchit-chandran <anchit97123@gmail.com>
  • Loading branch information
anchit-chandran committed Nov 4, 2024
1 parent 5231a7d commit ed8fab9
Showing 1 changed file with 58 additions and 24 deletions.
82 changes: 58 additions & 24 deletions project/npda/management/commands/create_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
This will generate one csv file containing 5 patients, each with 12 visits, using the visit encoding provided.
The HbA1c target range for each visit will be set to 'TARGET'.
The resulting csv will have 5 * 12 = 60 rows (one for each visit).
## Building multiple larger csv files
Building several csv files lets you create a spread of data with different ages, visits, hb_targets etc.
Using the `--build` flag will generate a `build` csv file, the same as above, but with
a `build__<n>_` filename prefix, where `<n>` is one more than the number of existing build
files in the output directory. The `--coalesce` flag can then be used to combine all the
build files into a single csv file.
python manage.py create_csv \
--pts=5 \
--visits="CDCD DHPC ACDC CDCD" \
Expand All @@ -46,7 +46,7 @@
--build \
&& python manage.py create_csv \
--coalesce
Options:
Expand All @@ -73,9 +73,9 @@
- T (TARGET)
- A (ABOVE)
- W (WELL_ABOVE)
--age_range (str, optional):
The possible age range for the patients to be seeded.
The possible age range for the patients to be seeded.
Defaults to 11_15.
- 0_4
- 5_10
Expand Down Expand Up @@ -173,20 +173,23 @@ def add_arguments(self, parser):
"--pts",
type=int,
help="Number of patients to seed.",
required="--coalesce" not in sys.argv, # Set required only if --coalesce is not used
required="--coalesce"
not in sys.argv, # Set required only if --coalesce is not used
)
parser.add_argument(
"--visits",
type=str,
help="Visit types (e.g., 'CDCD DHPC ACDC CDCD'). Can have whitespaces, these will be ignored.",
required="--coalesce" not in sys.argv, # Set required only if --coalesce is not used
required="--coalesce"
not in sys.argv, # Set required only if --coalesce is not used
)
parser.add_argument(
"--hb_target",
type=str,
choices=["T", "A", "W"],
help="HBA1C Target range for visit seeding.",
required="--coalesce" not in sys.argv, # Set required only if --coalesce is not used
required="--coalesce"
not in sys.argv, # Set required only if --coalesce is not used
)
parser.add_argument(
"--submission_date",
Expand Down Expand Up @@ -272,9 +275,13 @@ def handle(self, *args, **options):
self.print_info(f"\n--- Visit Types Provided ---\n")

# Divide the list into chunks of 4 for a compact table
visit_types_chunks = [visit_types[i : i + 4] for i in range(0, len(visit_types), 4)]
visit_types_chunks = [
visit_types[i : i + 4] for i in range(0, len(visit_types), 4)
]
for chunk in visit_types_chunks:
self.print_info(" ".join(f"{CYAN}{visit}{RESET}" for visit in chunk))
self.print_info(
" ".join(f"{CYAN}{visit}{RESET}" for visit in chunk)
)
print()

self.generate_csv(
Expand All @@ -288,9 +295,13 @@ def handle(self, *args, **options):
output_path,
build_flag,
)
self.print_success(f"✨ CSV generated successfully at {self.csv_name}.\n")
self.print_success(
f"✨ CSV generated successfully at {self.csv_name}.\n"
)
if build_flag:
self.print_info(f"Coalesce the build csv files using the --coalesce flag.")
self.print_info(
f"Coalesce the build csv files using the --coalesce flag."
)

def generate_csv(
self,
Expand Down Expand Up @@ -355,7 +366,9 @@ def generate_csv(
csv_heading,
) in field_heading_mappings.items():
if model == "Visit":
visit_dict[csv_heading] = getattr(visit, model_field)
visit_dict[csv_heading] = getattr(
visit, model_field
)
elif model == "Patient":
# Foreign key so need to manually set the value
if model_field == "pdu":
Expand Down Expand Up @@ -394,10 +407,14 @@ def _run_coalesce(self, **options):

# Get the existing build files
existing_build_files = [
f for f in os.listdir(options["output_path"]) if f.startswith("build")
f
for f in os.listdir(options["output_path"])
if f.startswith("build")
]
if not existing_build_files:
self.print_error(f"No build files to coalesce in {options['output_path']}/")
self.print_error(
f"No build files to coalesce in {options['output_path']}/"
)
return
self.print_info(f"{CYAN}Existing build files: {RESET}\n")
for file in existing_build_files:
Expand All @@ -424,17 +441,22 @@ def _run_coalesce(self, **options):

df.info()

csv_file_name = f"coalesced_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv"
csv_file_name = (
f"coalesced_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv"
)
full_csv_path = os.path.join(options["output_path"], csv_file_name)
df.to_csv(
full_csv_path,
index=False,
)

self.print_success(f"\n✨ CSV coalesced successfully at {full_csv_path}.\n")
self.print_success(
f"\n✨ CSV coalesced successfully at {full_csv_path}.\n"
)

# PRINT OUT DIFFERENCE IN DATA TYPES
orig = pd.read_csv("project/npda/dummy_sheets/dummy_sheet_invalid.csv")
comparison_csv = "dummy_sheet_invalid.csv"
orig = pd.read_csv(f"project/npda/dummy_sheets/{comparison_csv}")
# Get data types for both DataFrames
orig_dtypes = orig.dtypes
new_dtypes = df.dtypes
Expand All @@ -445,12 +467,17 @@ def _run_coalesce(self, **options):
mismatched_dtypes[col] = (orig_dtypes[col], new_dtypes[col])

# Print out mismatched columns and their respective data types
self.print_error("Columns with differing data types:")
self.print_error(
f"Columns with differing data types from {comparison_csv}:"
)
self.print_info("NOTE: columns with Nan are cast to float")
for col, (orig_type, new_type) in mismatched_dtypes.items():
print(
f"{col}: original type = {GREEN}{orig_type}{RESET}, coalesced type = {CYAN}{new_type}{RESET}"
)
print(
f"Value in original: {GREEN}{orig[col].iloc[0]}{RESET}, value in coalesced: {CYAN}{df[col].iloc[0]}{RESET}\n"
)

return

Expand All @@ -471,6 +498,7 @@ def _set_valid_dtypes(self, df: pd.DataFrame) -> pd.DataFrame:
**{
# Convert mismatched columns to the correct data type
"NHS Number": lambda x: x["NHS Number"].astype(str),

},
)
# Reorder columns
Expand All @@ -493,12 +521,16 @@ def _parse_values_from_options(self, **options):
datetime.strptime(submission_date_str, "%Y-%m-%d")
).date()
except ValueError:
self.print_error("Invalid submission_date format. Use YYYY-MM-DD.")
self.print_error(
"Invalid submission_date format. Use YYYY-MM-DD."
)
return
else:
submission_date = timezone.now().date()

audit_start_date, audit_end_date = get_audit_period_for_date(submission_date)
audit_start_date, audit_end_date = get_audit_period_for_date(
submission_date
)

# Number of patients to seed (pts)
n_pts_to_seed = options["pts"]
Expand Down Expand Up @@ -552,7 +584,9 @@ def _get_file_name(
building_str = ""
if build:
# First count the number of existing files to use this as filename prefix
existing_files = [f for f in os.listdir(output_path) if f.startswith("build")]
existing_files = [
f for f in os.listdir(output_path) if f.startswith("build")
]

# Set the building string filename prefix
building_str = f"build__{len(existing_files) + 1}_"
Expand Down

0 comments on commit ed8fab9

Please sign in to comment.