Skip to content

Commit

Permalink
Remove source entry ID #2714
Browse files Browse the repository at this point in the history
This obviates the problem with trying to keep hashed source entry IDs stable.

See discussion in #2739
  • Loading branch information
iamleeg committed Jun 29, 2022
1 parent adf6b79 commit 4b850ef
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 115 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from flask import jsonify
from datetime import date
from hashlib import sha256
from reusable_data_service.model.case import Case
from reusable_data_service.model.filter import (
Anything,
Expand Down Expand Up @@ -64,7 +63,7 @@ def create_case(self, maybe_case: dict, num_cases: int = 1):
if num_cases <= 0:
return "Must create a positive number of cases", 400
try:
case = self.create_anonymised_case_if_valid(maybe_case)
case = self.create_case_if_valid(maybe_case)
for i in range(num_cases):
self.store.insert_case(case)
return "", 201
Expand All @@ -78,7 +77,7 @@ def create_case(self, maybe_case: dict, num_cases: int = 1):
def validate_case_dictionary(self, maybe_case: dict):
"""Check whether a case _could_ be valid, without storing it if it is."""
try:
case = self.create_anonymised_case_if_valid(maybe_case)
case = self.create_case_if_valid(maybe_case)
return "", 204
except ValueError as ve:
# ValueError means we can't even turn this into a case
Expand All @@ -103,41 +102,28 @@ def batch_upsert(self, body: dict):
usable_cases = []
for i, maybe_case in enumerate(cases):
try:
case = self.create_anonymised_case_if_valid(maybe_case)
case = self.create_case_if_valid(maybe_case)
usable_cases.append(case)
except Exception as e:
errors[str(i)] = e.args[0]
(created, updated) = self.store.batch_upsert(usable_cases) if len(usable_cases) > 0 else (0,0)
(created, updated) = (
self.store.batch_upsert(usable_cases) if len(usable_cases) > 0 else (0, 0)
)
status = 200 if len(errors) == 0 else 207
response = {"numCreated": created, "numUpdated": updated, "errors": errors}
return jsonify(response), status

def create_anonymised_case_if_valid(self, maybe_case: dict):
def create_case_if_valid(self, maybe_case: dict):
"""Attempts to create a case from an input dictionary and validate it against
the application rules. Implements anonymisation rules so that cases cannot be
reidentified against external source data.
Raises ValueError or PreconditionError on invalid input."""
the application rules. Raises ValueError or PreconditionError on invalid input."""
case = Case.from_dict(maybe_case)
self.check_case_preconditions(case)
CaseController.anonymise_case(case)
return case

def check_case_preconditions(self, case: Case):
if case.confirmationDate < self.outbreak_date:
raise PreconditionError("Confirmation date is before outbreak began")

@staticmethod
def anonymise_case(case: Case):
"""Ensure that a stable one-way hash of the source entry ID is stored, and
not the source entry ID itself."""
if (
case.caseReference is not None
and case.caseReference.sourceEntryId is not None
):
case.caseReference.sourceEntryId = sha256(
case.caseReference.sourceEntryId.encode("utf-8")
).hexdigest()

@staticmethod
def parse_filter(filter: str) -> Filter:
"""Interpret the filter query in the incoming request."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ def validate(self):
raise ValueError("Confirmation Date is mandatory")
elif self.confirmationDate is None:
raise ValueError("Confirmation Date must have a value")
if not hasattr(self, "caseReference"):
raise ValueError("Case Reference is mandatory")
elif self.caseReference is None:
raise ValueError("Case Reference must have a value")
self.caseReference.validate()

def to_dict(self):
"""Return myself as a dictionary."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ class CaseReference:

_: dataclasses.KW_ONLY
sourceId: bson.ObjectId = dataclasses.field(init=False, default=None)
sourceEntryId: str = dataclasses.field(init=False, default=None)

def validate(self):
"""Check whether I am consistent. Raise ValueError if not."""
if not hasattr(self, "sourceId"):
raise ValueError("Source ID is mandatory")
elif self.sourceId is None:
raise ValueError("Source ID must have a value")

@staticmethod
def from_dict(d: dict[str, str]):
Expand All @@ -22,6 +28,4 @@ def from_dict(d: dict[str, str]):
ref.sourceId = bson.ObjectId(theId["$oid"])
else:
raise ValueError(f"Cannot interpret {theId} as an ObjectId")
if "sourceEntryId" in d:
ref.sourceEntryId = d["sourceEntryId"]
return ref
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,10 @@ def insert_case(self, case: Case):
self.get_case_collection().insert_one(to_insert)

def batch_upsert(self, cases: List[Case]) -> Tuple[int, int]:
dicts = [MongoStore.case_to_bson_compatible_dict(c) for c in cases]
needs_insert = lambda d: d['caseReference'] is None or d['caseReference']['sourceId'] is None or d['caseReference']['sourceEntryId'] is None
to_insert = [d for d in dicts if needs_insert(d)]
to_upsert = [d for d in dicts if not needs_insert(d)]
to_insert = [MongoStore.case_to_bson_compatible_dict(c) for c in cases]
inserts = [pymongo.InsertOne(d) for d in to_insert]
replacements = [pymongo.ReplaceOne(filter = {
'caseReference.sourceId': d['caseReference']['sourceId'],
'caseReference.sourceEntryId': d['caseReference']['sourceEntryId'],
}, replacement = d, upsert = True) for d in to_upsert]
results = self.get_case_collection().bulk_write(inserts + replacements)
return results.inserted_count, results.modified_count
results = self.get_case_collection().bulk_write(inserts)
return results.inserted_count, 0

@staticmethod
def setup():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
{
"confirmationDate": "2021-12-31T01:23:45.678Z"
"confirmationDate": "2021-12-31T01:23:45.678Z",
"caseReference": {
"sourceId": "fedc09876543210987654321"
}
}
90 changes: 24 additions & 66 deletions data-serving/reusable-data-service/tests/test_case_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ class MemoryStore:
def __init__(self):
self.cases = dict()
self.next_id = 0
self.upsert_create_count = 0

def case_by_id(self, id: str):
return self.cases.get(id)
Expand All @@ -33,19 +32,9 @@ def count_cases(self, *args):
return len(self.cases)

def batch_upsert(self, cases: List[Case]):
"""For testing the case controller, a trivial implementation. Look to
tests of the stores and integration tests for richer expressions of
behaviour; otherwise we end up duplicating a lot of the upsert logic
in this test double."""
original_create_count = self.upsert_create_count
for case in cases:
if self.upsert_create_count > 0:
self.insert_case(case)
self.upsert_create_count -= 1
else:
# don't do anything, pretending a case was updated
pass
return original_create_count, len(cases) - original_create_count
self.insert_case(case)
return len(cases), 0


@pytest.fixture
Expand Down Expand Up @@ -130,29 +119,43 @@ def test_create_case_with_missing_properties_400_error(case_controller):

def test_create_case_with_invalid_data_422_error(case_controller):
(response, status) = case_controller.create_case(
{"confirmationDate": date(2001, 3, 17)}
{
"confirmationDate": date(2001, 3, 17),
"caseReference": {"sourceId": "123ab4567890123ef4567890"},
}
)
assert status == 422


def test_create_valid_case_adds_to_collection(case_controller):
(response, status) = case_controller.create_case(
{"confirmationDate": date(2021, 6, 3)}
{
"confirmationDate": date(2021, 6, 3),
"caseReference": {"sourceId": "123ab4567890123ef4567890"},
}
)
assert status == 201
assert case_controller.store.count_cases() == 1


def test_create_valid_case_with_negative_count_400_error(case_controller):
(response, status) = case_controller.create_case(
{"confirmationDate": date(2021, 6, 3)}, num_cases=-7
{
"confirmationDate": date(2021, 6, 3),
"caseReference": {"sourceId": "123ab4567890123ef4567890"},
},
num_cases=-7,
)
assert status == 400


def test_create_valid_case_with_positive_count_adds_to_collection(case_controller):
(response, status) = case_controller.create_case(
{"confirmationDate": date(2021, 6, 3)}, num_cases=7
{
"confirmationDate": date(2021, 6, 3),
"caseReference": {"sourceId": "123ab4567890123ef4567890"},
},
num_cases=7,
)
assert status == 201
assert case_controller.store.count_cases() == 7
Expand All @@ -167,7 +170,10 @@ def test_validate_case_with_valid_case_returns_204_and_does_not_add_case(
case_controller,
):
(response, status) = case_controller.validate_case_dictionary(
{"confirmationDate": date(2021, 6, 3)}
{
"confirmationDate": date(2021, 6, 3),
"caseReference": {"sourceId": "123ab4567890123ef4567890"},
}
)
assert status == 204
assert case_controller.store.count_cases() == 0
Expand All @@ -191,7 +197,6 @@ def test_batch_upsert_with_empty_case_list_returns_400(case_controller):
def test_batch_upsert_creates_valid_case(case_controller):
with open("./tests/data/case.minimal.json", "r") as minimal_file:
minimal_case_description = json.loads(minimal_file.read())
case_controller.store.upsert_create_count = 1 # store should create this case
(response, status) = case_controller.batch_upsert(
{"cases": [minimal_case_description]}
)
Expand All @@ -202,56 +207,9 @@ def test_batch_upsert_creates_valid_case(case_controller):
assert response.json["errors"] == {}


def test_batch_upsert_updates_valid_case(case_controller):
with open("./tests/data/case.minimal.json", "r") as minimal_file:
minimal_case_description = json.loads(minimal_file.read())
case_controller.store.upsert_create_count = 0 # store should update this case
(response, status) = case_controller.batch_upsert(
{"cases": [minimal_case_description]}
)
assert status == 200
assert response.json["numCreated"] == 0
assert response.json["numUpdated"] == 1
assert response.json["errors"] == {}


def test_batch_upsert_reports_both_updates_and_inserts(case_controller):
with open("./tests/data/case.minimal.json", "r") as minimal_file:
minimal_case_description = json.loads(minimal_file.read())
case_controller.store.upsert_create_count = (
1 # store should create one, update other
)
(response, status) = case_controller.batch_upsert(
{"cases": [minimal_case_description, minimal_case_description]}
)
assert status == 200
assert response.json["numCreated"] == 1
assert response.json["numUpdated"] == 1
assert response.json["errors"] == {}


def test_batch_upsert_reports_errors(case_controller):
case_controller.store.upsert_create_count = (
0 # store won't have anything to do in this test anyway
)
(response, status) = case_controller.batch_upsert({"cases": [{}]})
assert status == 207
assert response.json["numCreated"] == 0
assert response.json["numUpdated"] == 0
assert response.json["errors"] == {"0": "Confirmation Date is mandatory"}


def test_batch_upsert_hides_original_source_entry_id(case_controller):
case_controller.store.upsert_create_count = (
1 # create the case so we can read it later
)
with open("./tests/data/case.minimal.json", "r") as minimal_file:
case = json.loads(minimal_file.read())
case["caseReference"] = {
"sourceId": "12345678901234567890abcd",
"sourceEntryId": "foo",
}
(response, status) = case_controller.batch_upsert({"cases": [case]})
assert status == 200
retrieved_case = case_controller.store.case_by_id("1")
assert retrieved_case.caseReference.sourceEntryId != "foo"
Loading

0 comments on commit 4b850ef

Please sign in to comment.