tweaks based on code review

ethyca · Jun 28, 2023 · 8565ca3 · 8565ca3
1 parent 87edf48
commit 8565ca3
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 6 deletions.
diff --git a/.fides/redis_dataset.yml b/.fides/redis_dataset.yml
@@ -20,6 +20,17 @@ dataset:
                     data_categories: [system.operations]
                     fidesops_meta:
                       data_type: string[]  # List of edges between the upstream collection and the current collection
+          - name: EN_DATA_USE_MAP__<privacy_request_id>
+            description: This map of traversed `Collection`s to associated `DataUse`s is stored and retrieved to be included in access request output packages.
+            data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+            fidesops_meta:
+              data_type: object # Dict mapping `Collection` addresses -> set of associated `DataUse`s
+            fields:
+              - name: <dataset_name>:<collection_name>  # `Collection` address
+                data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+                data_categories: [system.operations]
+                fidesops_meta:
+                  data_type: string[] # set of `DataUse`s associated with this `Collection`
           - name: EN_EMAIL_INFORMATION__<privacy_request_id>__<current_step>__<dataset_name>__<collection_name>  # Usage: For building emails associated with email-connector datasets at the end of the privacy request. This encrypted raw information is retrieved from each relevant email-based collection and used to build a single email per email connector, with instructions on how to mask data on the given dataset.
             fidesops_meta:
               data_type: object  # Stores how to locate and mask records for a given "email" collection.

diff --git a/src/fides/api/service/storage/storage_uploader_service.py b/src/fides/api/service/storage/storage_uploader_service.py
@@ -98,7 +98,7 @@ def _s3_uploader(
     auth_method = config.details[StorageDetails.AUTH_METHOD.value]
 
     return upload_to_s3(
-        config.secrets, data, bucket_name, file_key, config.format.value, privacy_request, auth_method, data_category_field_mapping  # type: ignore
+        config.secrets, data, bucket_name, file_key, config.format.value, privacy_request, auth_method, data_category_field_mapping, data_use_map  # type: ignore
     )
 
 
@@ -112,4 +112,4 @@ def _local_uploader(
 ) -> str:
     """Uploads data to local storage, used for quick-start/demo purposes"""
     file_key: str = _construct_file_key(privacy_request.id, config)
-    return upload_to_local(data, file_key, privacy_request, config.format.value, data_category_field_mapping)  # type: ignore
+    return upload_to_local(data, file_key, privacy_request, config.format.value, data_category_field_mapping, data_use_map)  # type: ignore
diff --git a/src/fides/api/task/graph_task.py b/src/fides/api/task/graph_task.py
@@ -671,15 +671,18 @@ def _format_data_use_map_for_caching(
 ) -> Dict[str, Set[str]]:
     """
     Create a map of `Collection`s mapped to their associated `DataUse`s
-    to be stored in the cache.
+    to be stored in the cache. This is done before request execution, so that we
+    maintain the _original_ state of the graph as it's used for request execution.
+    The graph is subject to change "from underneath" the request execution runtime,
+    but we want to avoid picking up those changes in our data use map.
 
     `DataUse`s are associated with a `Collection` by means of the `System`
     that's linked to a `Collection`'s `Connection` definition.
 
     Example:
     {
-       <collection1>: ["data_use_1", "data_use_2"],
-       <collection2>: ["data_use_1"],
+       <collection1>: {"data_use_1", "data_use_2"},
+       <collection2>: {"data_use_1"},
     }
     """
     return {collection.value: g_task.data_uses for collection, g_task in env.items()}
@@ -742,7 +745,14 @@ def termination_fn(
                 privacy_request, env, end_nodes, resources, ActionType.access
             )
         )
+
+        # cache access graph for use in logging/analytics event
         privacy_request.cache_access_graph(format_graph_for_caching(env, end_nodes))
+
+        # Cache a map of collections -> data uses for the output package of access requests.
+        # It is cached here, before request execution, because this is the state of the
+        # graph that request execution uses. The graph could change _during_ execution,
+        # but we don't want those changes reflected in our data use map.
         privacy_request.cache_data_use_map(_format_data_use_map_for_caching(env))
 
         v = delayed(get(dsk, TERMINATOR_ADDRESS, num_workers=1))

diff --git a/tests/ops/service/privacy_request/test_request_runner_service.py b/tests/ops/service/privacy_request/test_request_runner_service.py
@@ -349,7 +349,7 @@ def test_upload_access_results_has_data_category_field_mapping(
     args, kwargs = upload_mock.call_args
     data_category_field_mapping = kwargs["data_category_field_mapping"]
 
-    # make sure the catergory field mapping generally looks as we expect
+    # make sure the category field mapping generally looks as we expect
     address_mapping = data_category_field_mapping[
         CollectionAddress.from_string("postgres_example_test_dataset:address")
     ]