diff --git a/.fides/redis_dataset.yml b/.fides/redis_dataset.yml index 56f66e7bf5..c2c3831df3 100644 --- a/.fides/redis_dataset.yml +++ b/.fides/redis_dataset.yml @@ -20,6 +20,17 @@ dataset: data_categories: [system.operations] fidesops_meta: data_type: string[] # List of edges between the upstream collection and the current collection + - name: EN_DATA_USE_MAP__<privacy_request_id> + description: This map of traversed `Collection`s to associated `DataUse`s is stored and retrieved to be included in access request output packages. + data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified + fidesops_meta: + data_type: object # Dict mapping `Collection` addresses -> set of associated `DataUse`s + fields: + - name: <dataset_name>:<collection_name> # `Collection` address + data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified + data_categories: [system.operations] + fidesops_meta: + data_type: string[] # set of `DataUse`s associated with this `Collection` - name: EN_EMAIL_INFORMATION________ # Usage: For building emails associated with email-connector datasets at the end of the privacy request. This encrypted raw information is retrieved from each relevant email-based collection and used to build a single email per email connector, with instructions on how to mask data on the given dataset. fidesops_meta: data_type: object # Stores how to locate and mask records for a given "email" collection. 
diff --git a/src/fides/api/service/storage/storage_uploader_service.py b/src/fides/api/service/storage/storage_uploader_service.py index 1ef6b10be7..a94143add1 100644 --- a/src/fides/api/service/storage/storage_uploader_service.py +++ b/src/fides/api/service/storage/storage_uploader_service.py @@ -98,7 +98,7 @@ def _s3_uploader( auth_method = config.details[StorageDetails.AUTH_METHOD.value] return upload_to_s3( - config.secrets, data, bucket_name, file_key, config.format.value, privacy_request, auth_method, data_category_field_mapping # type: ignore + config.secrets, data, bucket_name, file_key, config.format.value, privacy_request, auth_method, data_category_field_mapping, data_use_map # type: ignore ) @@ -112,4 +112,4 @@ def _local_uploader( ) -> str: """Uploads data to local storage, used for quick-start/demo purposes""" file_key: str = _construct_file_key(privacy_request.id, config) - return upload_to_local(data, file_key, privacy_request, config.format.value, data_category_field_mapping) # type: ignore + return upload_to_local(data, file_key, privacy_request, config.format.value, data_category_field_mapping, data_use_map) # type: ignore diff --git a/src/fides/api/task/graph_task.py b/src/fides/api/task/graph_task.py index 15137f17b6..c156456dd7 100644 --- a/src/fides/api/task/graph_task.py +++ b/src/fides/api/task/graph_task.py @@ -671,15 +671,18 @@ def _format_data_use_map_for_caching( ) -> Dict[str, Set[str]]: """ Create a map of `Collection`s mapped to their associated `DataUse`s - to be stored in the cache. + to be stored in the cache. This is done before request execution, so that we + maintain the _original_ state of the graph as it's used for request execution. + The graph is subject to change "from underneath" the request execution runtime, + but we want to avoid picking up those changes in our data use map. `DataUse`s are associated with a `Collection` by means of the `System` that's linked to a `Collection`'s `Connection` definition. 
Example: { - <collection_1>: ["data_use_1", "data_use_2"], - <collection_2>: ["data_use_1"], + <collection_1>: {"data_use_1", "data_use_2"}, + <collection_2>: {"data_use_1"}, } """ return {collection.value: g_task.data_uses for collection, g_task in env.items()} @@ -742,7 +745,14 @@ def termination_fn( privacy_request, env, end_nodes, resources, ActionType.access ) ) + + # cache access graph for use in logging/analytics event privacy_request.cache_access_graph(format_graph_for_caching(env, end_nodes)) + + # cache a map of collections -> data uses for the output package of access requests + # this is cached here before request execution, since this is the state of the + # graph used for request execution. the graph could change _during_ request execution, + # but we don't want those changes in our data use map. privacy_request.cache_data_use_map(_format_data_use_map_for_caching(env)) v = delayed(get(dsk, TERMINATOR_ADDRESS, num_workers=1)) diff --git a/tests/ops/service/privacy_request/test_request_runner_service.py b/tests/ops/service/privacy_request/test_request_runner_service.py index 728302da3c..ba5884c225 100644 --- a/tests/ops/service/privacy_request/test_request_runner_service.py +++ b/tests/ops/service/privacy_request/test_request_runner_service.py @@ -349,7 +349,7 @@ def test_upload_access_results_has_data_category_field_mapping( args, kwargs = upload_mock.call_args data_category_field_mapping = kwargs["data_category_field_mapping"] - # make sure the catergory field mapping generally looks as we expect + # make sure the category field mapping generally looks as we expect address_mapping = data_category_field_mapping[ CollectionAddress.from_string("postgres_example_test_dataset:address") ]