Skip to content

Commit

Permalink
(#2984) Prevent Agate from coercing values in query result sets
Browse files Browse the repository at this point in the history
  • Loading branch information
drewbanin committed Jun 26, 2021
1 parent 41610b8 commit 1ad1c83
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Contributors:
### Fixes

- Handle quoted values within test configs, such as `where` ([#3458](https://github.com/fishtown-analytics/dbt/issues/3458), [#3459](https://github.com/fishtown-analytics/dbt/pull/3459))
- Fix type coercion issues when fetching query result sets ([#2984](https://github.com/fishtown-analytics/dbt/issues/2984), [#3499](https://github.com/fishtown-analytics/dbt/pull/3499))

### Under the hood

Expand Down
45 changes: 35 additions & 10 deletions core/dbt/clients/agate_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ def cast(self, d):
)


def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
def build_type_tester(
text_columns: Iterable[str],
string_null_values: Optional[Iterable[str]] = ('null', '')
) -> agate.TypeTester:

types = [
agate.data_types.Number(null_values=('null', '')),
agate.data_types.Date(null_values=('null', ''),
Expand All @@ -46,10 +50,10 @@ def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
agate.data_types.Boolean(true_values=('true',),
false_values=('false',),
null_values=('null', '')),
agate.data_types.Text(null_values=('null', ''))
agate.data_types.Text(null_values=string_null_values)
]
force = {
k: agate.data_types.Text(null_values=('null', ''))
k: agate.data_types.Text(null_values=string_null_values)
for k in text_columns
}
return agate.TypeTester(force=force, types=types)
Expand All @@ -66,7 +70,13 @@ def table_from_rows(
if text_only_columns is None:
column_types = DEFAULT_TYPE_TESTER
else:
column_types = build_type_tester(text_only_columns)
# If text_only_columns are present, prevent coercing empty string or
# literal 'null' strings to a None representation.
column_types = build_type_tester(
text_only_columns,
string_null_values=()
)

return agate.Table(rows, column_names, column_types=column_types)


Expand All @@ -86,19 +96,34 @@ def table_from_data(data, column_names: Iterable[str]) -> agate.Table:


def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table:
"Convert list of dictionaries into an Agate table"
"""
Convert a list of dictionaries into an Agate table. This method does not
coerce string values into more specific types (eg. '005' will not be
coerced to '5'). Additionally, this method does not coerce values to
None (eg. '' or 'null' will retain their string literal representations).
"""

rows = []
text_only_columns = set()
for _row in data:
row = []
for value in list(_row.values()):
for col_name in column_names:
value = _row[col_name]
if isinstance(value, (dict, list, tuple)):
row.append(json.dumps(value, cls=dbt.utils.JSONEncoder))
else:
row.append(value)
# Represent container types as json strings
value = json.dumps(value, cls=dbt.utils.JSONEncoder)
text_only_columns.add(col_name)
elif isinstance(value, str):
text_only_columns.add(col_name)
row.append(value)

rows.append(row)

return table_from_rows(rows=rows, column_names=column_names)
return table_from_rows(
rows=rows,
column_names=column_names,
text_only_columns=text_only_columns
)


def empty_table():
Expand Down
29 changes: 29 additions & 0 deletions test/integration/100_rpc_test/sql/generic.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

/*
Every column returned by this query should be interpreted
as a string. If any result is coerced into a non-string (eg.
an int, float, bool, date, NoneType, etc) then dbt did the
wrong thing
*/

select
'' as str_empty_string,

'null' as str_null,

'1' as str_int,
'00005' as str_int_2,
'00' as str_int_3,

'1.1' as str_float,
'00001.1' as str_float_2,

'true' as str_bool,
'True' as str_bool_2,

'2021-01-01' as str_date,
'2021-01-01T12:00:00Z' as str_datetime,

-- this is obviously not a date... but Agate used to think it was!
-- see: https://github.com/fishtown-analytics/dbt/issues/2984
'0010T00000aabbccdd' as str_obviously_not_date
21 changes: 21 additions & 0 deletions test/integration/100_rpc_test/test_execute_fetch_and_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ def do_test_file(self, filename):
self.assertTrue(len(table.rows) > 0, "agate table had no rows")

self.do_test_pickle(table)
return table

def assert_all_columns_are_strings(self, table):
for row in table:
for value in row:
self.assertEqual(type(value), str, f'Found a not-string: {value} in row {row}')

@use_profile('bigquery')
def test__bigquery_fetch_and_serialize(self):
Expand All @@ -51,3 +57,18 @@ def test__snowflake_fetch_and_serialize(self):
@use_profile('redshift')
def test__redshift_fetch_and_serialize(self):
self.do_test_file('redshift.sql')

@use_profile('bigquery')
def test__bigquery_type_coercion(self):
table = self.do_test_file('generic.sql')
self.assert_all_columns_are_strings(table)

@use_profile('snowflake')
def test__snowflake_type_coercion(self):
table = self.do_test_file('generic.sql')
self.assert_all_columns_are_strings(table)

@use_profile('redshift')
def test__redshift_type_coercion(self):
table = self.do_test_file('generic.sql')
self.assert_all_columns_are_strings(table)
21 changes: 21 additions & 0 deletions test/unit/test_agate_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,24 @@ def test_merge_mixed(self):
assert isinstance(result.column_types[1], agate.data_types.Text)
assert isinstance(result.column_types[2], agate.data_types.Text)
self.assertEqual(len(result), 6)

def test_nocast_string_types(self):
# String fields should not be coerced into a representative type
# See: https://github.com/fishtown-analytics/dbt/issues/2984

column_names = ['a', 'b', 'c', 'd', 'e']
result_set = [
{'a': '0005', 'b': '01T00000aabbccdd', 'c': 'true', 'd': 10, 'e': False},
{'a': '0006', 'b': '01T00000aabbccde', 'c': 'false', 'd': 11, 'e': True},
]

tbl = agate_helper.table_from_data_flat(data=result_set, column_names=column_names)
self.assertEqual(len(tbl), len(result_set))

expected = [
['0005', '01T00000aabbccdd', 'true', Decimal(10), False],
['0006', '01T00000aabbccde', 'false', Decimal(11), True],
]

for i, row in enumerate(tbl):
self.assertEqual(list(row), expected[i])

0 comments on commit 1ad1c83

Please sign in to comment.