diff --git a/docs/schema.rst b/docs/schema.rst index ee841db..1706013 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -114,6 +114,32 @@ This is useful if you need to anonymize one or more specific records, eg for "Ri provider: name: clear +The YAML schema file supports placeholders for environment variables, e.g.: + +``!ENV ${HOST}`` + +``!ENV '/var/${LOG_PATH}'`` + +So you can construct dynamic filter conditions like: +.. code-block:: sh + $ export COMPANY_ID=123 + + $ export ACTION_TO_BE_TAKEN=clear + + $ pganonymize + + +**Example**:: + + - login: + search: id = '!ENV ${COMPANY_ID}' + search2: id = ${COMPANY_ID} + search3: username = '${USER_TO_BE_SEARCHED}' + fields: + - first_name: + provider: + name: ${ACTION_TO_BE_TAKEN} + ``chunk_size`` ~~~~~~~~~~~~~~ diff --git a/pganonymizer/cli.py b/pganonymizer/cli.py index d05cd2f..bf1f60a 100644 --- a/pganonymizer/cli.py +++ b/pganonymizer/cli.py @@ -6,11 +6,9 @@ import logging import time -import yaml - from pganonymizer.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE from pganonymizer.providers import provider_registry -from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables +from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables def get_pg_args(args): @@ -64,7 +62,7 @@ def main(args): list_provider_classes() return 0 - schema = yaml.load(open(args.schema), Loader=yaml.FullLoader) + schema = load_config(args.schema) pg_args = get_pg_args(args) connection = get_connection(pg_args) diff --git a/pganonymizer/utils.py b/pganonymizer/utils.py index 188bce5..ce795a6 100644 --- a/pganonymizer/utils.py +++ b/pganonymizer/utils.py @@ -5,6 +5,7 @@ import json import logging import math +import os import re import subprocess import time @@ -15,6 +16,7 @@ from pgcopy import CopyManager from psycopg2.sql import SQL, Composed, Identifier from tqdm import trange +import yaml from pganonymizer.constants import 
# YAML tag that marks a scalar whose ``${NAME}`` placeholders must be expanded.
_ENV_TAG = '!ENV'
# Matches any scalar containing at least one ``${NAME}`` placeholder; the
# captured group is the environment variable name.
_ENV_PATTERN = re.compile(r'.*?\${(\w+)}.*?')


def _expand_env_variables(value):
    """
    Replace every ``${NAME}`` placeholder in ``value`` with ``os.environ[NAME]``.

    A placeholder whose variable is not set is replaced by the bare variable
    name (``${FOO}`` becomes ``FOO``).

    :param str value: the raw scalar text
    :return: the text with all placeholders substituted
    :rtype: str
    """
    expanded = value
    for name in _ENV_PATTERN.findall(value):  # every variable name in the text
        expanded = expanded.replace('${' + name + '}', os.environ.get(name, name))
    return expanded


def load_config(schema):
    """
    Load a YAML schema file and expand environment variable placeholders.

    Scalars tagged explicitly with ``!ENV`` and plain (unquoted) scalars that
    contain ``${NAME}`` are passed through :func:`_expand_env_variables`.
    Quoted scalars without the tag are returned untouched.

    :param str schema: path of the YAML schema file
    :return: the parsed schema
    """
    # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931

    class EnvVarLoader(yaml.FullLoader):
        """
        Private loader subclass, so the resolver and constructor registered
        below do not leak into the process-wide ``yaml.FullLoader`` and do not
        pile up when ``load_config`` is called repeatedly.
        """

    def constructor_env_variables(loader, node):
        """
        Extracts the environment variable from the node's value
        :param yaml.Loader loader: the yaml loader
        :param node: the current node in the yaml
        :return: the parsed string that contains the value of the environment
        variable
        """
        return _expand_env_variables(loader.construct_scalar(node))

    EnvVarLoader.add_implicit_resolver(_ENV_TAG, _ENV_PATTERN, None)
    EnvVarLoader.add_constructor(_ENV_TAG, constructor_env_variables)

    # NOTE(review): FullLoader is not meant for untrusted documents; the schema
    # file is assumed to be operator-provided - confirm before exposing this.
    with open(schema) as schema_file:  # close the handle deterministically
        return yaml.load(schema_file, Loader=EnvVarLoader)
{'name': 'fake.first_name'}}}, + {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}}, + {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}} + ], 'excludes': [{'email': ['\\S[^@]*@example\\.com']}]}}], 'truncate': ['django_session']}], + ['./tests/schemes/schema_with_env_variables.yml', { + "TEST_CHUNK_SIZE": "123", + "TEST_PRIMARY_KEY": "foo-bar", + "PRESENT_WORLD_NAME": "beautiful world", + "COMPANY_ID": "42", + "USER_TO_BE_SEARCHED": "i wanna be forgotten", + }, { + 'primary_key': 'foo-bar', + 'primary_key2': 'foo-bar', + 'chunk_size': '123', + 'concat_missing': 'Hello, MISSING_ENV_VAL', + 'concat_missing2': 'Hello, ${MISSING_ENV_VAL}', + 'concat_present': 'Hello, beautiful world', + 'concat_present2': 'beautiful world', + 'concat_present3': 'Hello, beautiful world', + 'search': 'id = 42', + 'search2': "username = 'i wanna be forgotten'", + 'corrupted': "username = '${CORRUPTED", + 'corrupted2': '', + 'corrupted3': '$' + } + ] + ]) + def test(self, file, envs, expected): + with mock.patch.dict(os.environ, envs): + print(load_config(file)) + assert load_config(file) == expected