diff --git a/data-serving/data-service/src/controllers/case.ts b/data-serving/data-service/src/controllers/case.ts index 00ae385eb..7a612559e 100644 --- a/data-serving/data-service/src/controllers/case.ts +++ b/data-serving/data-service/src/controllers/case.ts @@ -58,7 +58,7 @@ const caseFromDTO = async (receivedCase: CaseDTO) => { const dtoFromCase = async (storedCase: LeanDocument) => { let dto = storedCase as unknown as CaseDTO; - if (storedCase.demographics.ageBuckets && storedCase.demographics.ageBuckets.length > 0) { + if (storedCase.demographics && storedCase.demographics.ageBuckets && storedCase.demographics.ageBuckets.length > 0) { const ageBuckets = await Promise.all(storedCase.demographics.ageBuckets.map((bucketId) => { return AgeBucket.findById(bucketId).lean() })); diff --git a/data-serving/scripts/export-data/README.md b/data-serving/scripts/export-data/README.md index 884b75b7a..a8973decc 100644 --- a/data-serving/scripts/export-data/README.md +++ b/data-serving/scripts/export-data/README.md @@ -37,7 +37,7 @@ during the build phase: You will need write access to S3. Alternatively, you can setup localstack to have a mock S3 environment. To run a hypothetical export for Antarctica: - docker run -e 'COUNTRY=Antarctica' -e 'CONN=xx' country-export + docker run -e 'COUNTRY=Antarctica' -e 'CONN=xx' -e 'BUCKET=yyy' country-export ## Setting up exports diff --git a/data-serving/scripts/export-data/country_export.sh b/data-serving/scripts/export-data/country_export.sh index 6ff28a7fc..3758dab1a 100755 --- a/data-serving/scripts/export-data/country_export.sh +++ b/data-serving/scripts/export-data/country_export.sh @@ -7,10 +7,16 @@ require_env "${CONN:-}" "Specify MongoDB connection string in CONN" require_env "${BUCKET:-}" "Specify S3 bucket to output files in BUCKET" require_env "${COUNTRY:-}" "Specify which country code to export in COUNTRY" +SCRATCH="$(mktemp -d)" +BUCKETS="${SCRATCH}/buckets.json" +trap 'rm -rf "$SCRATCH"' EXIT # Cleanup before exit + FORMAT="${FORMAT:-csv,tsv,json}" QUERY="{\"list\": true, \"location.country\": \"$COUNTRY\"}" + +mongoexport --uri="$CONN" --collection=agebuckets --type=json > "${BUCKETS}" mongoexport --query="$QUERY" --uri="$CONN" --collection=cases \ - --fieldFile=fields.txt --type=csv | python3 transform.py -f "$FORMAT" "$COUNTRY" + --fieldFile=fields.txt --type=csv | python3 transform.py -f "$FORMAT" -b "${BUCKETS}" "$COUNTRY" # ignore shellcheck warning on word splitting, as it's actually needed here # shellcheck disable=SC2086 diff --git a/data-serving/scripts/export-data/fields.txt b/data-serving/scripts/export-data/fields.txt index 6d73a553f..907bc2106 100644 --- a/data-serving/scripts/export-data/fields.txt +++ b/data-serving/scripts/export-data/fields.txt @@ -5,8 +5,7 @@ caseReference.sourceId caseReference.sourceUrl caseReference.uploadIds caseReference.verificationStatus -demographics.ageRange.end -demographics.ageRange.start +demographics.ageBuckets demographics.ethnicity demographics.gender demographics.nationalities diff --git a/data-serving/scripts/export-data/test_age_buckets.json b/data-serving/scripts/export-data/test_age_buckets.json new file mode 100644 index 000000000..47207b526 --- /dev/null +++ b/data-serving/scripts/export-data/test_age_buckets.json @@ -0,0 +1,22 @@ +[ + { + "_id": "001", + "start": 0, + "end": 0 + }, + { + "_id": "002", + "start": 1, + "end": 5 + }, + { + "_id": "003", + "start": 6, + "end": 9 + }, + { + "_id": "004", + "start": 10, + "end": 14 + } +] \ No newline at end of file diff --git a/data-serving/scripts/export-data/test_transform.py b/data-serving/scripts/export-data/test_transform.py index 44a13e785..9c6b88573 100644 --- a/data-serving/scripts/export-data/test_transform.py +++ b/data-serving/scripts/export-data/test_transform.py @@ -77,6 +77,19 @@ "travelHistory.travel.methods": "Ship,Raft", } +_BUCKETS = [ + { + "_id": "001", + "start": 20, + "end": 24, + }, + { + "_id": "002", + "start": 25, + "end": 29, + } +] + def _read_csv(fn): with open(fn) as f: @@ -107,7 +120,7 @@ def test_convert_travel(): def test_transform_output_match(fmt): expected = Path(f'test_transform_mongoexport_expected.{fmt}').read_text() with redirect_stdout(io.StringIO()) as f: - T.transform('test_transform_mongoexport.csv', '-', [fmt]) + T.transform('test_transform_mongoexport.csv', '-', [fmt], "test_age_buckets.json") # use str.splitlines to ignore line endings expected_lines = expected.splitlines() @@ -120,13 +133,44 @@ def test_transform_output_match(fmt): def test_transform_empty(tmp_path): output = f"{tmp_path}/empty" - T.transform('test_transform_mongoexport_header.csv', output, ['csv']) + T.transform('test_transform_mongoexport_header.csv', output, ['csv'], "test_age_buckets.json") assert not Path(f"{output}.csv.gz").exists() def test_transform_creates_output(tmp_path): formats = ['csv', 'tsv', 'json'] output = f"{tmp_path}/output" - T.transform('test_transform_mongoexport.csv', output, formats) + T.transform('test_transform_mongoexport.csv', output, formats, "test_age_buckets.json") for fmt in formats: assert Path(f"{output}.{fmt}.gz").exists() + + +def test_transform_buckets_age_ranges(): + expected = Path(f'test_transform_mongoexport_bucketed_ages_expected.csv').read_text() + with redirect_stdout(io.StringIO()) as f: + T.transform('test_transform_mongoexport_bucketed_ages.csv', '-', ['csv'], 'test_age_buckets.json') + + expected_lines = expected.splitlines() + actual_lines = f.getvalue().splitlines() + + lines_to_compare = zip(expected_lines, actual_lines) + for line_pair in lines_to_compare: + assert line_pair[0] == line_pair[1] + + +def test_age_bucket_conversion(): + case_buckets_json = "[\"001\", \"002\"]" + (start, end) = T.age_range(case_buckets_json, _BUCKETS) + assert start == 20 + assert end == 29 + + +def test_age_bucket_row_conversion(): + row = { + "_id": "ObjectId(abc123)", + "travelHistory.traveledPrior30Days": "false", + "demographics.ageBuckets": "[\"001\"]" + } + converted_row = T.convert_row(row, _BUCKETS) + assert converted_row["demographics.ageRange.start"] == 20 + assert converted_row["demographics.ageRange.end"] == 24 diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages.csv b/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages.csv new file mode 100644 index 000000000..7ca557a3c --- /dev/null +++ b/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages.csv @@ -0,0 +1,3 @@ +_id,caseReference.additionalSources,caseReference.sourceEntryId,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,demographics.ageBuckets,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,events,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geoResolution,location.geometry.latitude,location.geometry.longitude,location.name,location.place,location.query,notes,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.curator,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.curator,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,SGTF,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.name,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.name,vaccines.0.batch,vaccines.0.date,vaccines.0.sideEffects,vaccines.1.name,vaccines.1.batch,vaccines.1.date,vaccines.1.sideEffects,vaccines.2.name,vaccines.2.batch,vaccines.2.date,vaccines.2.sideEffects,vaccines.3.name,vaccines.3.batch,vaccines.3.date,vaccines.3.sideEffects +ObjectId(6817283abaa89324a90109aa),[],,787123878aa90909811aaff1,http://foo/bar.csv,"[""bb12399abbb19230900aa123""]",UNVERIFIED,"[""001"", ""002""]",,Male,[],,"[{""name"":""confirmed"",""dateRange"":{""start"":{""$date"":""2021-10-01T00:00:00.000Z""},""end"":{""$date"":""2021-01-01T00:00:00.000Z""}}}]",,,,Antarctica,Country,-79.402,0.323,Antarctica,,,,[],,,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,0,,,,,,,,,,,true,,,,,,,,,,,,,,,, +ObjectId(798989a98998acc98989a1bb),[],,787123878aa90909811aaff1,http://foo/bar.csv,"[""bb12399abbb19230900aa123""]",UNVERIFIED,"[""003""]",,Female,[],,"[{""name"":""confirmed"",""dateRange"":{""start"":{""$date"":""2021-01-05T00:00:00.000Z""},""end"":{""$date"":""2021-01-05T00:00:00.000Z""}}}]",,,,Antarctica,Country,-79.402,0.323,Antarctica,,,,[],,,ingestion@example.com,2021-01-02T13:42:34.991Z,,,,,0,,,,,,,,,,,true,,,,,,,,,,,,,,,, diff --git a/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages_expected.csv b/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages_expected.csv new file mode 100644 index 000000000..96f149940 --- /dev/null +++ b/data-serving/scripts/export-data/test_transform_mongoexport_bucketed_ages_expected.csv @@ -0,0 +1,3 @@ +_id,caseReference.additionalSources,caseReference.sourceEntryId,caseReference.sourceId,caseReference.sourceUrl,caseReference.uploadIds,caseReference.verificationStatus,demographics.ageRange.end,demographics.ageRange.start,demographics.ethnicity,demographics.gender,demographics.nationalities,demographics.occupation,events.confirmed.date,events.confirmed.value,events.firstClinicalConsultation.date,events.hospitalAdmission.date,events.hospitalAdmission.value,events.icuAdmission.date,events.icuAdmission.value,events.onsetSymptoms.date,events.outcome.date,events.outcome.value,events.selfIsolation.date,location.administrativeAreaLevel1,location.administrativeAreaLevel2,location.administrativeAreaLevel3,location.country,location.geometry.latitude,location.geometry.longitude,location.geoResolution,location.name,location.place,notes,pathogens,preexistingConditions.hasPreexistingConditions,preexistingConditions.values,revisionMetadata.creationMetadata.date,revisionMetadata.creationMetadata.notes,revisionMetadata.editMetadata.date,revisionMetadata.editMetadata.notes,revisionMetadata.revisionNumber,SGTF,symptoms.status,symptoms.values,transmission.linkedCaseIds,transmission.places,transmission.routes,travelHistory.travel.dateRange.end,travelHistory.travel.dateRange.start,travelHistory.travel.location.administrativeAreaLevel1,travelHistory.travel.location.administrativeAreaLevel2,travelHistory.travel.location.administrativeAreaLevel3,travelHistory.travel.location.country,travelHistory.travel.location.geometry.coordinates,travelHistory.travel.location.geoResolution,travelHistory.travel.location.name,travelHistory.travel.location.place,travelHistory.travel.methods,travelHistory.travel.purpose,travelHistory.traveledPrior30Days,vaccines.0.batch,vaccines.0.date,vaccines.0.name,vaccines.0.sideEffects,vaccines.1.batch,vaccines.1.date,vaccines.1.name,vaccines.1.sideEffects,vaccines.2.batch,vaccines.2.date,vaccines.2.name,vaccines.2.sideEffects,vaccines.3.batch,vaccines.3.date,vaccines.3.name,vaccines.3.sideEffects,variantOfConcern +ObjectId(6817283abaa89324a90109aa),,,787123878aa90909811aaff1,http://foo/bar.csv,bb12399abbb19230900aa123,UNVERIFIED,5,0,,Male,,,2021-01-01,,,,,,,,,,,,,,Antarctica,-79.402,0.323,Country,Antarctica,,,,,,2021-01-02T13:42:34.991Z,,,,0,NA,,,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,,, +ObjectId(798989a98998acc98989a1bb),,,787123878aa90909811aaff1,http://foo/bar.csv,bb12399abbb19230900aa123,UNVERIFIED,9,6,,Female,,,2021-01-05,,,,,,,,,,,,,,Antarctica,-79.402,0.323,Country,Antarctica,,,,,,2021-01-02T13:42:34.991Z,,,,0,NA,,,,,,,,,,,,,,,,,true,,,,,,,,,,,,,,,,,, diff --git a/data-serving/scripts/export-data/transform.py b/data-serving/scripts/export-data/transform.py index c7e240d26..bf58beae5 100644 --- a/data-serving/scripts/export-data/transform.py +++ b/data-serving/scripts/export-data/transform.py @@ -184,6 +184,8 @@ def get_headers_and_fields(fileobject) -> list[str]: logging.exception("Error in reading mongoexport header") sys.exit(1) cols_to_add = [ + "demographics.ageRange.start", + "demographics.ageRange.end", "events.confirmed.value", "events.confirmed.date", "events.firstClinicalConsultation.date", @@ -196,21 +198,33 @@ def get_headers_and_fields(fileobject) -> list[str]: "events.outcome.value", "events.selfIsolation.date", ] + cols_to_remove = [ + "demographics.ageBuckets", + ] fields = set(headers).union(set(cols_to_add)) fields = fields.union(set(__TRAVEL + __VARIANT)) + fields = fields.difference(cols_to_remove) fields = sorted(list(fields - set(__OMIT)), key=str.casefold) return headers, fields -def convert_row(row: dict[str, Any]) -> Optional[dict[str, Any]]: +def age_range(case_buckets: str, buckets: [dict[str, Any]]) -> (int, int): + bucket_ids = json.loads(case_buckets) + matching_buckets = [b for b in buckets if b["_id"] in bucket_ids] + min_age = min([b["start"] for b in matching_buckets]) + max_age = max([b["end"] for b in matching_buckets]) + return (min_age, max_age) + + +def convert_row(row: dict[str, Any], buckets: [dict[str, Any]]) -> Optional[dict[str, Any]]: if "ObjectId" not in row["_id"]: return None - if type(row["notes"]) == str: + if type(row.get("notes")) == str: row["notes"] = row["notes"].replace("\n", ", ") for arr_field in __ARRAYS: - if row[arr_field]: + if row.get(arr_field): row[arr_field] = convert_string_list(row[arr_field]) - if row["caseReference.additionalSources"]: + if row.get("caseReference.additionalSources"): row["caseReference.additionalSources"] = convert_addl_sources( row["caseReference.additionalSources"] ) @@ -222,6 +236,9 @@ def convert_row(row: dict[str, Any]) -> Optional[dict[str, Any]]: if row["travelHistory.traveledPrior30Days"] == "true": if "travelHistory.travel" in row: row.update(convert_travel(row["travelHistory.travel"])) + if row.get("demographics.ageBuckets", None): + (row["demographics.ageRange.start"], row["demographics.ageRange.end"]) = age_range(row["demographics.ageBuckets"], buckets) + del row["demographics.ageBuckets"] return row @@ -288,19 +305,21 @@ def open_writers(formats: list[str], fields: list[str], output: str): files[fmt].close() -def transform(input: Optional[str], output: str, formats: list[str]): +def transform(input: Optional[str], output: str, formats: list[str], bucketpath: str): with (open(input) if input else sys.stdin) as inputfile: - headers, fields = get_headers_and_fields(inputfile) - reader = csv.DictReader(inputfile, fieldnames=headers) - hasrows = False - with open_writers(formats, fields, output) as writers: - for i, row in enumerate(map(convert_row, reader)): - hasrows = True - writerow(formats, writers, row, i) - if output != "-" and not hasrows: # cleanup empty files - cleanup_files = [Path(f"{output}.{fmt}.gz") for fmt in formats] - for file in cleanup_files: - file.unlink(missing_ok=True) + with open(bucketpath) as bucketfile: + buckets = json.load(bucketfile) + headers, fields = get_headers_and_fields(inputfile) + reader = csv.DictReader(inputfile, fieldnames=headers) + hasrows = False + with open_writers(formats, fields, output) as writers: + for i, row in enumerate(map(lambda row: convert_row(row, buckets), reader)): + hasrows = True + writerow(formats, writers, row, i) + if output != "-" and not hasrows: # cleanup empty files + cleanup_files = [Path(f"{output}.{fmt}.gz") for fmt in formats] + for file in cleanup_files: + file.unlink(missing_ok=True) if __name__ == "__main__": @@ -318,5 +337,11 @@ def transform(input: Optional[str], output: str, formats: list[str]): "--input", help="Input file to transform instead of stdin" ) + parser.add_argument( + "-b", + "--buckets", + help="JSON collection of age buckets to determine case age ranges", + required=True + ) args = parser.parse_args() - transform(args.input, args.output, args.format.split(",")) + transform(args.input, args.output, args.format.split(","), args.buckets)