Skip to content

Commit

Permalink
Merge pull request #38 from yojee/add_envs
Browse files Browse the repository at this point in the history
Allow environment variables in schema definition
  • Loading branch information
hkage authored Dec 13, 2021
2 parents 14228ee + f91aaa7 commit 5fd4953
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 6 deletions.
26 changes: 26 additions & 0 deletions docs/schema.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,32 @@ This is useful if you need to anonymize one or more specific records, eg for "Ri
provider:
name: clear

The YAML schema file supports placeholders that are replaced with the values of environment variables, e.g.:

``!ENV ${HOST}``

``!ENV '/var/${LOG_PATH}'``

This allows you to construct dynamic filter conditions, for example:

.. code-block:: sh

    $ export COMPANY_ID=123
    $ export ACTION_TO_BE_TAKEN=clear
    $ pganonymize
**Example**::

- login:
search: id = '!ENV ${COMPANY_ID}'
search2: id = ${COMPANY_ID}
search3: username = '${USER_TO_BE_SEARCHED}'
fields:
- first_name:
provider:
name: ${ACTION_TO_BE_TAKEN}

``chunk_size``
~~~~~~~~~~~~~~

Expand Down
6 changes: 2 additions & 4 deletions pganonymizer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@
import logging
import time

import yaml

from pganonymizer.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE
from pganonymizer.providers import provider_registry
from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables
from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables


def get_pg_args(args):
Expand Down Expand Up @@ -64,7 +62,7 @@ def main(args):
list_provider_classes()
return 0

schema = yaml.load(open(args.schema), Loader=yaml.FullLoader)
schema = load_config(args.schema)

pg_args = get_pg_args(args)
connection = get_connection(pg_args)
Expand Down
35 changes: 34 additions & 1 deletion pganonymizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import logging
import math
import os
import re
import subprocess
import time
Expand All @@ -15,6 +16,7 @@
from pgcopy import CopyManager
from psycopg2.sql import SQL, Composed, Identifier
from tqdm import trange
import yaml

from pganonymizer.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY
from pganonymizer.providers import provider_registry
Expand Down Expand Up @@ -82,6 +84,7 @@ def build_and_then_import_data(connection, table, primary_key, columns,
sql_select = Composed([sql_select, SQL(" WHERE {search_condition}".format(search_condition=search))])
if dry_run:
sql_select = Composed([sql_select, SQL(" LIMIT 100")])
logging.info(sql_select.as_string(connection))
cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor, name='fetch_large_result')
cursor.execute(sql_select.as_string(connection))
temp_table = 'tmp_{table}'.format(table=table)
Expand Down Expand Up @@ -155,7 +158,7 @@ def create_temporary_table(connection, definitions, source_table, temp_table, pr
FROM {source_table} WITH NO DATA""")
cursor = connection.cursor()
cursor.execute(ctas_query.format(temp_table=Identifier(temp_table),
source_table=Identifier(source_table), columns=sql_columns)
source_table=Identifier(source_table), columns=sql_columns)
.as_string(connection)
)
cursor.close()
Expand Down Expand Up @@ -350,3 +353,33 @@ def nested_set(dic, path, value, delimiter='.'):
for key in keys[:-1]:
dic = dic.get(key, {})
dic[keys[-1]] = value


def load_config(schema):
    """
    Load a YAML schema file and expand ``${VAR}`` environment placeholders.

    Plain (unquoted) scalars containing ``${NAME}`` and scalars explicitly
    tagged with ``!ENV`` have every ``${NAME}`` occurrence replaced by the
    value of the environment variable ``NAME``. If the variable is not set,
    the placeholder is replaced by the bare variable name.

    :param str schema: path to the YAML schema file
    :return: the parsed schema with environment placeholders expanded
    """
    # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931
    tag = '!ENV'
    pattern = re.compile(r'.*?\${(\w+)}.*?')

    class EnvVarLoader(yaml.FullLoader):
        """Private loader so the ``!ENV`` handling is not registered on the shared ``yaml.FullLoader``."""

    # Registering on the subclass copies the resolver/constructor tables, so
    # other users of ``yaml.FullLoader`` in this process are left untouched and
    # repeated calls do not pile up duplicate resolvers on a global class.
    EnvVarLoader.add_implicit_resolver(tag, pattern, None)

    def constructor_env_variables(loader, node):
        """
        Extracts the environment variable from the node's value
        :param yaml.Loader loader: the yaml loader
        :param node: the current node in the yaml
        :return: the parsed string that contains the value of the environment
        variable
        """
        value = loader.construct_scalar(node)
        full_value = value
        for name in pattern.findall(value):  # every env variable referenced in the scalar
            # Fall back to the variable's own name when it is not set.
            full_value = full_value.replace(
                f'${{{name}}}', os.environ.get(name, name)
            )
        return full_value

    EnvVarLoader.add_constructor(tag, constructor_env_variables)

    # Use a context manager so the schema file handle is always closed.
    with open(schema) as schema_file:
        return yaml.load(schema_file, Loader=EnvVarLoader)
13 changes: 13 additions & 0 deletions tests/schemes/schema_with_env_variables.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
primary_key: !ENV ${TEST_PRIMARY_KEY}
primary_key2: !ENV ${TEST_PRIMARY_KEY}
chunk_size: !ENV ${TEST_CHUNK_SIZE}
concat_missing: !ENV 'Hello, ${MISSING_ENV_VAL}'
concat_missing2: 'Hello, ${MISSING_ENV_VAL}'
concat_present: !ENV 'Hello, ${PRESENT_WORLD_NAME}'
concat_present2: ${PRESENT_WORLD_NAME}
concat_present3: Hello, ${PRESENT_WORLD_NAME}
search: id = ${COMPANY_ID}
search2: username = '${USER_TO_BE_SEARCHED}'
corrupted: username = '${CORRUPTED
corrupted2: !ENV
corrupted3: !ENV $
42 changes: 41 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import math
import os
from collections import OrderedDict, namedtuple
from unittest import mock

import pytest
from mock import ANY, Mock, call, patch

from tests.utils import quote_ident

from pganonymizer.utils import (anonymize_tables, build_and_then_import_data, create_database_dump,
get_column_values, get_connection, import_data, truncate_tables)
get_column_values, get_connection, import_data, load_config, truncate_tables)


class TestGetConnection:
Expand Down Expand Up @@ -236,3 +238,41 @@ def test(self, mock_call):
create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432})
mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz',
shell=True)


class TestConfigLoader:
    """Tests for ``load_config``: plain schemas and schemas with environment placeholders."""

    @pytest.mark.parametrize('file, envs, expected', [
        # Schema without any placeholders is loaded unchanged.
        ['./tests/schemes/valid_schema.yml', {}, {
            'tables': [{'auth_user': {'primary_key': 'id', 'chunk_size': 5000, 'fields': [
                {'first_name': {'provider': {'name': 'fake.first_name'}}},
                {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}},
                {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}}
            ], 'excludes': [{'email': ['\\S[^@]*@example\\.com']}]}}], 'truncate': ['django_session']}],
        # Schema with placeholders: present, missing and malformed variables.
        ['./tests/schemes/schema_with_env_variables.yml', {
            "TEST_CHUNK_SIZE": "123",
            "TEST_PRIMARY_KEY": "foo-bar",
            "PRESENT_WORLD_NAME": "beautiful world",
            "COMPANY_ID": "42",
            "USER_TO_BE_SEARCHED": "i wanna be forgotten",
        }, {
            'primary_key': 'foo-bar',
            'primary_key2': 'foo-bar',
            'chunk_size': '123',
            'concat_missing': 'Hello, MISSING_ENV_VAL',
            'concat_missing2': 'Hello, ${MISSING_ENV_VAL}',
            'concat_present': 'Hello, beautiful world',
            'concat_present2': 'beautiful world',
            'concat_present3': 'Hello, beautiful world',
            'search': 'id = 42',
            'search2': "username = 'i wanna be forgotten'",
            'corrupted': "username = '${CORRUPTED",
            'corrupted2': '',
            'corrupted3': '$'
        }
        ]
    ])
    def test(self, file, envs, expected):
        """Load ``file`` with ``envs`` patched into the environment and compare with ``expected``."""
        with mock.patch.dict(os.environ, envs):
            # Load once; the former debug print loaded the schema a second time.
            assert load_config(file) == expected

0 comments on commit 5fd4953

Please sign in to comment.