Allow environment variables in schema definition #38

Merged: 4 commits, Dec 13, 2021
26 changes: 26 additions & 0 deletions docs/schema.rst
@@ -114,6 +114,32 @@ This is useful if you need to anonymize one or more specific records, eg for "Ri
        provider:
          name: clear

The YAML schema file supports placeholders that are resolved from environment variables, for example:

``!ENV ${HOST}``

``!ENV '/var/${LOG_PATH}'``

This lets you construct dynamic filter conditions, for example:

.. code-block:: sh

    $ export COMPANY_ID=123
    $ export ACTION_TO_BE_TAKEN=clear
    $ pganonymize

**Example**::

 - login:
    search: id = '!ENV ${COMPANY_ID}'
    search2: id = ${COMPANY_ID}
    search3: username = '${USER_TO_BE_SEARCHED}'
    fields:
     - first_name:
        provider:
          name: ${ACTION_TO_BE_TAKEN}
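
For illustration, a minimal sketch of what ``load_config`` returns for the example above, assuming the snippet is saved on its own as a hypothetical ``schema.yml`` and the two variables are exported as shown:

.. code-block:: python

    import os
    from pganonymizer.utils import load_config

    os.environ['COMPANY_ID'] = '123'
    os.environ['ACTION_TO_BE_TAKEN'] = 'clear'

    schema = load_config('schema.yml')
    # Plain scalars containing ${...} are resolved from the environment:
    assert schema[0]['login']['search2'] == 'id = 123'
    assert schema[0]['login']['fields'][0]['first_name']['provider']['name'] == 'clear'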

``chunk_size``
~~~~~~~~~~~~~~

6 changes: 2 additions & 4 deletions pganonymizer/cli.py
@@ -6,11 +6,9 @@
import logging
import time

import yaml

from pganonymizer.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE
from pganonymizer.providers import provider_registry
from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables
from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables


def get_pg_args(args):
@@ -64,7 +62,7 @@ def main(args):
        list_provider_classes()
        return 0

    schema = yaml.load(open(args.schema), Loader=yaml.FullLoader)
    schema = load_config(args.schema)

    pg_args = get_pg_args(args)
    connection = get_connection(pg_args)
35 changes: 34 additions & 1 deletion pganonymizer/utils.py
@@ -5,6 +5,7 @@
import json
import logging
import math
import os
import re
import subprocess
import time
@@ -15,6 +16,7 @@
from pgcopy import CopyManager
from psycopg2.sql import SQL, Composed, Identifier
from tqdm import trange
import yaml

from pganonymizer.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY
from pganonymizer.providers import provider_registry
@@ -82,6 +84,7 @@ def build_and_then_import_data(connection, table, primary_key, columns,
        sql_select = Composed([sql_select, SQL(" WHERE {search_condition}".format(search_condition=search))])
    if dry_run:
        sql_select = Composed([sql_select, SQL(" LIMIT 100")])
    logging.info(sql_select.as_string(connection))
    cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor, name='fetch_large_result')
    cursor.execute(sql_select.as_string(connection))
    temp_table = 'tmp_{table}'.format(table=table)
@@ -155,7 +158,7 @@ def create_temporary_table(connection, definitions, source_table, temp_table, pr
                        FROM {source_table} WITH NO DATA""")
    cursor = connection.cursor()
    cursor.execute(ctas_query.format(temp_table=Identifier(temp_table),
                                     source_table=Identifier(source_table), columns=sql_columns)
                   .as_string(connection)
                   )
    cursor.close()
@@ -350,3 +353,33 @@ def nested_set(dic, path, value, delimiter='.'):
    for key in keys[:-1]:
        dic = dic.get(key, {})
    dic[keys[-1]] = value


def load_config(schema):
    # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931
    tag = '!ENV'
    pattern = re.compile(r'.*?\${(\w+)}.*?')
    custom_loader = yaml.FullLoader
    custom_loader.add_implicit_resolver(tag, pattern, None)

    def constructor_env_variables(loader, node):
        """
        Extracts the environment variable from the node's value
        :param yaml.Loader loader: the yaml loader
        :param node: the current node in the yaml
        :return: the parsed string that contains the value of the environment
            variable
        """
        value = loader.construct_scalar(node)
        match = pattern.findall(value)  # to find all env variables in line
        if match:
            full_value = value
            for g in match:
                full_value = full_value.replace(
                    f'${{{g}}}', os.environ.get(g, g)
                )
            return full_value
        return value

    custom_loader.add_constructor(tag, constructor_env_variables)
    return yaml.load(open(schema), Loader=custom_loader)
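
The ``os.environ.get(g, g)`` fallback above means an unset variable is replaced by its own name instead of raising an error, and quoted scalars without an explicit ``!ENV`` tag are never resolved, since the implicit resolver only fires on plain scalars. A minimal sketch of that behaviour, with a hypothetical ``example.yml`` mirroring the test schema below:

    import os
    from pganonymizer.utils import load_config

    # example.yml (hypothetical):
    #   greeting: !ENV 'Hello, ${MISSING_ENV_VAL}'
    #   literal: 'Hello, ${MISSING_ENV_VAL}'
    os.environ.pop('MISSING_ENV_VAL', None)  # ensure the variable is unset
    config = load_config('example.yml')
    assert config['greeting'] == 'Hello, MISSING_ENV_VAL'    # tagged: variable name used as fallback
    assert config['literal'] == 'Hello, ${MISSING_ENV_VAL}'  # quoted, untagged: left as-is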
13 changes: 13 additions & 0 deletions tests/schemes/schema_with_env_variables.yml
@@ -0,0 +1,13 @@
primary_key: !ENV ${TEST_PRIMARY_KEY}
primary_key2: !ENV ${TEST_PRIMARY_KEY}
chunk_size: !ENV ${TEST_CHUNK_SIZE}
concat_missing: !ENV 'Hello, ${MISSING_ENV_VAL}'
concat_missing2: 'Hello, ${MISSING_ENV_VAL}'
concat_present: !ENV 'Hello, ${PRESENT_WORLD_NAME}'
concat_present2: ${PRESENT_WORLD_NAME}
concat_present3: Hello, ${PRESENT_WORLD_NAME}
search: id = ${COMPANY_ID}
search2: username = '${USER_TO_BE_SEARCHED}'
corrupted: username = '${CORRUPTED
corrupted2: !ENV
corrupted3: !ENV $
42 changes: 41 additions & 1 deletion tests/test_utils.py
@@ -1,13 +1,15 @@
import math
import os
from collections import OrderedDict, namedtuple
from unittest import mock

import pytest
from mock import ANY, Mock, call, patch

from tests.utils import quote_ident

from pganonymizer.utils import (anonymize_tables, build_and_then_import_data, create_database_dump,
get_column_values, get_connection, import_data, truncate_tables)
get_column_values, get_connection, import_data, load_config, truncate_tables)


class TestGetConnection:
@@ -236,3 +238,41 @@ def test(self, mock_call):
        create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432})
        mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz',
                                          shell=True)


class TestConfigLoader:

    @pytest.mark.parametrize('file, envs, expected', [
        ['./tests/schemes/valid_schema.yml', {}, {
            'tables': [{'auth_user': {'primary_key': 'id', 'chunk_size': 5000, 'fields': [
                {'first_name': {'provider': {'name': 'fake.first_name'}}},
                {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}},
                {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}}
            ], 'excludes': [{'email': ['\\S[^@]*@example\\.com']}]}}], 'truncate': ['django_session']}],
        ['./tests/schemes/schema_with_env_variables.yml', {
            "TEST_CHUNK_SIZE": "123",
            "TEST_PRIMARY_KEY": "foo-bar",
            "PRESENT_WORLD_NAME": "beautiful world",
            "COMPANY_ID": "42",
            "USER_TO_BE_SEARCHED": "i wanna be forgotten",
        }, {
            'primary_key': 'foo-bar',
            'primary_key2': 'foo-bar',
            'chunk_size': '123',
            'concat_missing': 'Hello, MISSING_ENV_VAL',
            'concat_missing2': 'Hello, ${MISSING_ENV_VAL}',
            'concat_present': 'Hello, beautiful world',
            'concat_present2': 'beautiful world',
            'concat_present3': 'Hello, beautiful world',
            'search': 'id = 42',
            'search2': "username = 'i wanna be forgotten'",
            'corrupted': "username = '${CORRUPTED",
            'corrupted2': '',
            'corrupted3': '$'
        }]
    ])
    def test(self, file, envs, expected):
        with mock.patch.dict(os.environ, envs):
            print(load_config(file))
            assert load_config(file) == expected