Include data_use and data_category metadata in upload of access results #3674

Merged · 6 commits · Jun 28, 2023
11 changes: 11 additions & 0 deletions .fides/redis_dataset.yml
@@ -20,6 +20,17 @@ dataset:
data_categories: [system.operations]
fidesops_meta:
data_type: string[] # List of edges between the upstream collection and the current collection
- name: EN_DATA_USE_MAP__<privacy_request_id>
description: This map of traversed `Collection`s to associated `DataUse`s is stored and retrieved to be included in access request output packages.
data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
fidesops_meta:
data_type: object # Dict mapping `Collection` addresses -> set of associated `DataUse`s
fields:
- name: <dataset_name>:<collection_name> # `Collection` address
data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
data_categories: [system.operations]
fidesops_meta:
data_type: string[] # set of `DataUse`s associated with this `Collection`
- name: EN_EMAIL_INFORMATION__<privacy_request_id>__<current_step>__<dataset_name>__<collection_name> # Usage: For building emails associated with email-connector datasets at the end of the privacy request. This encrypted raw information is retrieved from each relevant email-based collection and used to build a single email per email connector, with instructions on how to mask data on the given dataset.
fidesops_meta:
data_type: object # Stores how to locate and mask records for a given "email" collection.
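For illustration, a decoded EN_DATA_USE_MAP__<privacy_request_id> entry would take the shape described by the schema above; this is a sketch only, and the dataset, collection, and data use names are hypothetical:

    # Hypothetical decoded cache value: Collection address -> set of DataUse keys.
    data_use_map = {
        "postgres_example:customer": {"marketing.advertising", "essential.service"},
        "postgres_example:orders": {"essential.service"},
    }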
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@ The types of changes are:
### Added
- Empty state for when there are no relevant privacy notices in the privacy center [#3640](https://github.com/ethyca/fides/pull/3640)
- Set `sslmode` to `prefer` if connecting to Redshift via ssh [#3685](https://github.com/ethyca/fides/pull/3685)
- Include `data_use` and `data_category` metadata in `upload` of access results [#3674](https://github.com/ethyca/fides/pull/3674)

### Fixed
- Render linebreaks in the Fides.js overlay descriptions, etc. [#3665](https://github.com/ethyca/fides/pull/3665)
4 changes: 3 additions & 1 deletion src/fides/api/graph/graph.py
@@ -18,6 +18,8 @@
SeedAddress,
)

DataCategoryFieldMapping = Dict[CollectionAddress, Dict[FidesKey, List[FieldPath]]]


class Node:
"""A traversal_node represents a single collection as a graph traversal_node.
Expand Down Expand Up @@ -229,7 +231,7 @@ def __init__(self, *datasets: GraphDataset) -> None:
@property
def data_category_field_mapping(
self,
) -> Dict[CollectionAddress, Dict[FidesKey, List[FieldPath]]]:
) -> DataCategoryFieldMapping:
"""
Maps the data_categories for each traversal_node to a list of field paths that have that
same data category.
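As a sketch of the aliased shape (the addresses, categories, and field paths below are hypothetical, and the FieldPath constructor usage is an assumption), a DataCategoryFieldMapping value nests as follows:

    # CollectionAddress -> data category (FidesKey) -> field paths tagged with it
    mapping: DataCategoryFieldMapping = {
        CollectionAddress("postgres_example", "customer"): {
            "user.contact.email": [FieldPath("email")],
            "user.name": [FieldPath("name"), FieldPath("profile", "display_name")],
        },
    }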
20 changes: 19 additions & 1 deletion src/fides/api/models/privacy_request.py
@@ -5,7 +5,7 @@
import json
from datetime import datetime, timedelta
from enum import Enum as EnumType
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Set, Union

from celery.result import AsyncResult
from loguru import logger
@@ -660,6 +660,24 @@ def get_cached_access_graph(self) -> Optional[GraphRepr]:
] = cache.get_encoded_objects_by_prefix(f"ACCESS_GRAPH__{self.id}")
return list(value_dict.values())[0] if value_dict else None

def cache_data_use_map(self, value: Dict[str, Set[str]]) -> None:
"""
Cache a dict of collections traversed in the privacy request
mapped to their associated data uses
"""
cache: FidesopsRedis = get_cache()
cache.set_encoded_object(f"DATA_USE_MAP__{self.id}", value)

def get_cached_data_use_map(self) -> Optional[Dict[str, Set[str]]]:
"""
Fetch the collection -> data use map cached for this privacy request
"""
cache: FidesopsRedis = get_cache()
value_dict: Optional[
Dict[str, Optional[Dict[str, Set[str]]]]
] = cache.get_encoded_objects_by_prefix(f"DATA_USE_MAP__{self.id}")
return list(value_dict.values())[0] if value_dict else None

def trigger_policy_webhook(
self,
webhook: WebhookTypes,
@@ -231,6 +231,8 @@ def upload_access_results( # pylint: disable=R0912
privacy_request=privacy_request,
data=filtered_results,
storage_key=storage_destination.key, # type: ignore
data_category_field_mapping=dataset_graph.data_category_field_mapping,
data_use_map=privacy_request.get_cached_data_use_map(),
)
if download_url:
download_urls.append(download_url)
32 changes: 25 additions & 7 deletions src/fides/api/service/storage/storage_uploader_service.py
@@ -1,10 +1,11 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Set

from fideslang.validation import FidesKey
from loguru import logger
from sqlalchemy.orm import Session

from fides.api.common_exceptions import StorageUploadError
from fides.api.graph.graph import DataCategoryFieldMapping
from fides.api.models.privacy_request import PrivacyRequest
from fides.api.models.storage import StorageConfig
from fides.api.schemas.storage.storage import (
@@ -17,7 +18,12 @@


def upload(
db: Session, *, privacy_request: PrivacyRequest, data: Dict, storage_key: FidesKey
db: Session,
privacy_request: PrivacyRequest,
data: Dict,
storage_key: FidesKey,
data_category_field_mapping: Optional[DataCategoryFieldMapping] = None,
data_use_map: Optional[Dict[str, Set[str]]] = None,
) -> str:
"""
Retrieves storage configs and calls appropriate upload method
@@ -35,7 +41,9 @@ def upload(
logger.warning("Storage type not found: {}", storage_key)
raise StorageUploadError(f"Storage type not found: {storage_key}")
uploader: Any = _get_uploader_from_config_type(config.type) # type: ignore
return uploader(db, config, data, privacy_request)
return uploader(
db, config, data, privacy_request, data_category_field_mapping, data_use_map
)
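Mirroring the upload_access_results call site shown earlier, a caller now threads both mappings through; this sketch reuses the variable names from that snippet:

    download_url: str = upload(
        db,
        privacy_request=privacy_request,
        data=filtered_results,
        storage_key=storage_destination.key,
        data_category_field_mapping=dataset_graph.data_category_field_mapping,
        data_use_map=privacy_request.get_cached_data_use_map(),
    )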


def get_extension(resp_format: ResponseFormat) -> str:
@@ -76,7 +84,12 @@ def _get_uploader_from_config_type(storage_type: StorageType) -> Any:


def _s3_uploader(
_: Session, config: StorageConfig, data: Dict, privacy_request: PrivacyRequest
_: Session,
config: StorageConfig,
data: Dict,
privacy_request: PrivacyRequest,
data_category_field_mapping: Optional[DataCategoryFieldMapping] = None,
data_use_map: Optional[Dict[str, Set[str]]] = None,
) -> str:
"""Constructs necessary info needed for s3 before calling upload"""
file_key: str = _construct_file_key(privacy_request.id, config)
@@ -85,13 +98,18 @@ def _s3_uploader(
auth_method = config.details[StorageDetails.AUTH_METHOD.value]

return upload_to_s3(
config.secrets, data, bucket_name, file_key, config.format.value, privacy_request, auth_method # type: ignore
config.secrets, data, bucket_name, file_key, config.format.value, privacy_request, auth_method, data_category_field_mapping, data_use_map # type: ignore
)


def _local_uploader(
_: Session, config: StorageConfig, data: Dict, privacy_request: PrivacyRequest
_: Session,
config: StorageConfig,
data: Dict,
privacy_request: PrivacyRequest,
data_category_field_mapping: Optional[DataCategoryFieldMapping] = None,
data_use_map: Optional[Dict[str, Set[str]]] = None,
) -> str:
"""Uploads data to local storage, used for quick-start/demo purposes"""
file_key: str = _construct_file_key(privacy_request.id, config)
return upload_to_local(data, file_key, privacy_request, config.format.value) # type: ignore
return upload_to_local(data, file_key, privacy_request, config.format.value, data_category_field_mapping, data_use_map) # type: ignore
38 changes: 38 additions & 0 deletions src/fides/api/task/graph_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from fides.api.models.connectionconfig import AccessLevel, ConnectionConfig
from fides.api.models.policy import Policy
from fides.api.models.privacy_request import ExecutionLogStatus, PrivacyRequest
from fides.api.models.sql_models import System # type: ignore[attr-defined]
from fides.api.schemas.policy import ActionType
from fides.api.service.connectors.base_connector import BaseConnector
from fides.api.task.consolidate_query_matches import consolidate_query_matches
@@ -174,6 +175,13 @@ def __init__(
self.connector: BaseConnector = resources.get_connector(
self.traversal_node.node.dataset.connection_key # ConnectionConfig.key
)
self.data_uses: Set[str] = (
System.get_data_uses(
[self.connector.configuration.system], include_parents=False
)
if self.connector.configuration.system
else set() # empty set (not {}, which is a dict) when no System is linked
)

# build incoming edges to the form : [dataset address: [(foreign field, local field)]
self.incoming_edges_by_collection: Dict[
@@ -658,6 +666,28 @@ def update_mapping_from_cache(
)


def _format_data_use_map_for_caching(
env: Dict[CollectionAddress, "GraphTask"]
) -> Dict[str, Set[str]]:
"""
Create a map of `Collection`s mapped to their associated `DataUse`s
to be stored in the cache. This is done before request execution, so that we
maintain the _original_ state of the graph as it's used for request execution.
The graph is subject to change "from underneath" the request execution runtime,
but we want to avoid picking up those changes in our data use map.

`DataUse`s are associated with a `Collection` by means of the `System`
that's linked to a `Collection`'s `Connection` definition.

Example:
{
<collection1>: {"data_use_1", "data_use_2"},
<collection2>: {"data_use_1"},
}
"""
return {collection.value: g_task.data_uses for collection, g_task in env.items()}
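Illustratively (the task names and data uses below are hypothetical GraphTask stand-ins), two entries in env flatten to string-keyed sets:

    env = {
        CollectionAddress("postgres_example", "customer"): customer_task,  # data_uses == {"marketing.advertising"}
        CollectionAddress("postgres_example", "orders"): orders_task,  # data_uses == {"essential.service"}
    }
    _format_data_use_map_for_caching(env)
    # -> {"postgres_example:customer": {"marketing.advertising"},
    #     "postgres_example:orders": {"essential.service"}}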


def start_function(seed: List[Dict[str, Any]]) -> Callable[[], List[Dict[str, Any]]]:
"""Return a function for collections with no upstream dependencies, that just start
with seed data.
@@ -715,8 +745,16 @@ def termination_fn(
privacy_request, env, end_nodes, resources, ActionType.access
)
)

# cache access graph for use in logging/analytics event
privacy_request.cache_access_graph(format_graph_for_caching(env, end_nodes))

# cache a map of collections -> data uses for the output package of access requests
# this is cached here before request execution, since this is the state of the
# graph used for request execution. the graph could change _during_ request execution,
# but we don't want those changes in our data use map.
privacy_request.cache_data_use_map(_format_data_use_map_for_caching(env))

v = delayed(get(dsk, TERMINATOR_ADDRESS, num_workers=1))
return v.compute()

7 changes: 6 additions & 1 deletion src/fides/api/tasks/storage.py
@@ -5,14 +5,15 @@
import secrets
import zipfile
from io import BytesIO
from typing import Any, Dict, Union
from typing import Any, Dict, Optional, Set, Union

import pandas as pd
from boto3 import Session
from botocore.exceptions import ClientError, ParamValidationError
from loguru import logger

from fides.api.cryptography.cryptographic_util import bytes_to_b64_str
from fides.api.graph.graph import DataCategoryFieldMapping
from fides.api.models.privacy_request import PrivacyRequest
from fides.api.schemas.storage.storage import (
ResponseFormat,
@@ -133,6 +134,8 @@ def upload_to_s3( # pylint: disable=R0913
resp_format: str,
privacy_request: PrivacyRequest,
auth_method: S3AuthMethod,
data_category_field_mapping: Optional[DataCategoryFieldMapping] = None,
data_use_map: Optional[Dict[str, Set[str]]] = None,
) -> str:
"""Uploads arbitrary data to s3 returned from an access request"""
logger.info("Starting S3 Upload of {}", file_key)
@@ -171,6 +174,8 @@ def upload_to_local(
file_key: str,
privacy_request: PrivacyRequest,
resp_format: str = ResponseFormat.json.value,
data_category_field_mapping: Optional[DataCategoryFieldMapping] = None,
data_use_map: Optional[Dict[str, Set[str]]] = None,
) -> str:
"""Uploads access request data to a local folder - for testing/demo purposes only"""
if not os.path.exists(LOCAL_FIDES_UPLOAD_DIRECTORY):
3 changes: 3 additions & 0 deletions tests/fixtures/postgres_fixtures.py
@@ -19,6 +19,7 @@
PrivacyRequest,
)
from fides.api.models.sql_models import Dataset as CtlDataset
from fides.api.models.sql_models import System
from fides.api.service.connectors import PostgreSQLConnector
from fides.config import CONFIG
from tests.ops.test_helpers.db_utils import seed_postgres_data
@@ -178,6 +179,7 @@ def disabled_connection_config(
@pytest.fixture(scope="function")
def read_connection_config(
db: Session,
system: System,
) -> Generator:
connection_config = ConnectionConfig.create(
db=db,
@@ -186,6 +188,7 @@ def read_connection_config(
"key": "my_postgres_db_1_read_config",
"connection_type": ConnectionType.postgres,
"access": AccessLevel.read,
"system_id": system.id,
"secrets": integration_secrets["postgres_example"],
"description": "Read-only connection config",
},