From 1ad1c834f3dca6523ca68347b90f70dd34d1daa1 Mon Sep 17 00:00:00 2001 From: Drew Banin Date: Sat, 26 Jun 2021 12:39:16 -0400 Subject: [PATCH] (#2984) Prevent Agate from coercing values in query result sets --- CHANGELOG.md | 1 + core/dbt/clients/agate_helper.py | 45 ++++++++++++++----- test/integration/100_rpc_test/sql/generic.sql | 29 ++++++++++++ .../test_execute_fetch_and_serialize.py | 21 +++++++++ test/unit/test_agate_helper.py | 21 +++++++++ 5 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 test/integration/100_rpc_test/sql/generic.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index f9206d2545e..c4cc7bd162e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Contributors: ### Fixes - Handle quoted values within test configs, such as `where` ([#3458](https://github.com/fishtown-analytics/dbt/issues/3458), [#3459](https://github.com/fishtown-analytics/dbt/pull/3459)) +- Fix type coercion issues when fetching query result sets ([#2984](https://github.com/fishtown-analytics/dbt/issues/2984), [#3499](https://github.com/fishtown-analytics/dbt/pull/3499)) ### Under the hood diff --git a/core/dbt/clients/agate_helper.py b/core/dbt/clients/agate_helper.py index 29f285edac3..d2f9298df48 100644 --- a/core/dbt/clients/agate_helper.py +++ b/core/dbt/clients/agate_helper.py @@ -35,7 +35,11 @@ def cast(self, d): ) -def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester: +def build_type_tester( + text_columns: Iterable[str], + string_null_values: Optional[Iterable[str]] = ('null', '') +) -> agate.TypeTester: + types = [ agate.data_types.Number(null_values=('null', '')), agate.data_types.Date(null_values=('null', ''), @@ -46,10 +50,10 @@ def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester: agate.data_types.Boolean(true_values=('true',), false_values=('false',), null_values=('null', '')), - agate.data_types.Text(null_values=('null', '')) + agate.data_types.Text(null_values=string_null_values) ] force = { - k: agate.data_types.Text(null_values=('null', '')) + k: agate.data_types.Text(null_values=string_null_values) for k in text_columns } return agate.TypeTester(force=force, types=types) @@ -66,7 +70,13 @@ def table_from_rows( if text_only_columns is None: column_types = DEFAULT_TYPE_TESTER else: - column_types = build_type_tester(text_only_columns) + # If text_only_columns are present, prevent coercing empty string or + # literal 'null' strings to a None representation. + column_types = build_type_tester( + text_only_columns, + string_null_values=() + ) + return agate.Table(rows, column_names, column_types=column_types) @@ -86,19 +96,34 @@ def table_from_data(data, column_names: Iterable[str]) -> agate.Table: def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table: - "Convert list of dictionaries into an Agate table" + """ + Convert a list of dictionaries into an Agate table. This method does not + coerce string values into more specific types (eg. '005' will not be + coerced to '5'). Additionally, this method does not coerce values to + None (eg. '' or 'null' will retain their string literal representations). + """ rows = [] + text_only_columns = set() for _row in data: row = [] - for value in list(_row.values()): + for col_name in column_names: + value = _row[col_name] if isinstance(value, (dict, list, tuple)): - row.append(json.dumps(value, cls=dbt.utils.JSONEncoder)) - else: - row.append(value) + # Represent container types as json strings + value = json.dumps(value, cls=dbt.utils.JSONEncoder) + text_only_columns.add(col_name) + elif isinstance(value, str): + text_only_columns.add(col_name) + row.append(value) + rows.append(row) - return table_from_rows(rows=rows, column_names=column_names) + return table_from_rows( + rows=rows, + column_names=column_names, + text_only_columns=text_only_columns + ) def empty_table(): diff --git a/test/integration/100_rpc_test/sql/generic.sql b/test/integration/100_rpc_test/sql/generic.sql new file mode 100644 index 00000000000..5e00cdc46f8 --- /dev/null +++ b/test/integration/100_rpc_test/sql/generic.sql @@ -0,0 +1,29 @@ + +/* + Every column returned by this query should be interpreted + as a string. If any result is coerced into a non-string (eg. + an int, float, bool, date, NoneType, etc) then dbt did the + wrong thing +*/ + +select + '' as str_empty_string, + + 'null' as str_null, + + '1' as str_int, + '00005' as str_int_2, + '00' as str_int_3, + + '1.1' as str_float, + '00001.1' as str_float_2, + + 'true' as str_bool, + 'True' as str_bool_2, + + '2021-01-01' as str_date, + '2021-01-01T12:00:00Z' as str_datetime, + + -- this is obviously not a date... but Agate used to think it was! + -- see: https://github.com/fishtown-analytics/dbt/issues/2984 + '0010T00000aabbccdd' as str_obviously_not_date diff --git a/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py b/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py index d5fa2ba76fe..6c4a73eaf74 100644 --- a/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py +++ b/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py @@ -39,6 +39,12 @@ def do_test_file(self, filename): self.assertTrue(len(table.rows) > 0, "agate table had no rows") self.do_test_pickle(table) + return table + + def assert_all_columns_are_strings(self, table): + for row in table: + for value in row: + self.assertEqual(type(value), str, f'Found a not-string: {value} in row {row}') @use_profile('bigquery') def test__bigquery_fetch_and_serialize(self): @@ -51,3 +57,18 @@ def test__snowflake_fetch_and_serialize(self): @use_profile('redshift') def test__redshift_fetch_and_serialize(self): self.do_test_file('redshift.sql') + + @use_profile('bigquery') + def test__bigquery_type_coercion(self): + table = self.do_test_file('generic.sql') + self.assert_all_columns_are_strings(table) + + @use_profile('snowflake') + def test__snowflake_type_coercion(self): + table = self.do_test_file('generic.sql') + self.assert_all_columns_are_strings(table) + + @use_profile('redshift') + def test__redshift_type_coercion(self): + table = self.do_test_file('generic.sql') + self.assert_all_columns_are_strings(table) diff --git a/test/unit/test_agate_helper.py b/test/unit/test_agate_helper.py index 8f15ff75984..6add176c601 100644 --- a/test/unit/test_agate_helper.py +++ b/test/unit/test_agate_helper.py @@ -133,3 +133,24 @@ def test_merge_mixed(self): assert isinstance(result.column_types[1], agate.data_types.Text) assert isinstance(result.column_types[2], agate.data_types.Text) self.assertEqual(len(result), 6) + + def test_nocast_string_types(self): + # String fields should not be coerced into a representative type + # See: https://github.com/fishtown-analytics/dbt/issues/2984 + + column_names = ['a', 'b', 'c', 'd', 'e'] + result_set = [ + {'a': '0005', 'b': '01T00000aabbccdd', 'c': 'true', 'd': 10, 'e': False}, + {'a': '0006', 'b': '01T00000aabbccde', 'c': 'false', 'd': 11, 'e': True}, + ] + + tbl = agate_helper.table_from_data_flat(data=result_set, column_names=column_names) + self.assertEqual(len(tbl), len(result_set)) + + expected = [ + ['0005', '01T00000aabbccdd', 'true', Decimal(10), False], + ['0006', '01T00000aabbccde', 'false', Decimal(11), True], + ] + + for i, row in enumerate(tbl): + self.assertEqual(list(row), expected[i])