Skip to content

Commit

Permalink
Merge pull request #3499 from dbt-labs/fix/agate-undesirable-casting
Browse files Browse the repository at this point in the history
Prevent Agate from coercing values in query result sets
  • Loading branch information
drewbanin authored Jul 1, 2021
2 parents b9d5123 + 85fe32b commit 83b98c8
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Fixes
- Fix docs generation for cross-db sources in REDSHIFT RA3 node ([#3236](https://github.com/fishtown-analytics/dbt/issues/3236), [#3408](https://github.com/fishtown-analytics/dbt/pull/3408))
- Fix type coercion issues when fetching query result sets ([#2984](https://github.com/fishtown-analytics/dbt/issues/2984), [#3499](https://github.com/fishtown-analytics/dbt/pull/3499))

### Under the hood
- Improve default view and table materialization performance by checking relational cache before attempting to drop temp relations ([#3112](https://github.com/fishtown-analytics/dbt/issues/3112), [#3468](https://github.com/fishtown-analytics/dbt/pull/3468))
Expand Down
45 changes: 35 additions & 10 deletions core/dbt/clients/agate_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ def cast(self, d):
)


def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
def build_type_tester(
text_columns: Iterable[str],
string_null_values: Optional[Iterable[str]] = ('null', '')
) -> agate.TypeTester:

types = [
agate.data_types.Number(null_values=('null', '')),
agate.data_types.Date(null_values=('null', ''),
Expand All @@ -46,10 +50,10 @@ def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
agate.data_types.Boolean(true_values=('true',),
false_values=('false',),
null_values=('null', '')),
agate.data_types.Text(null_values=('null', ''))
agate.data_types.Text(null_values=string_null_values)
]
force = {
k: agate.data_types.Text(null_values=('null', ''))
k: agate.data_types.Text(null_values=string_null_values)
for k in text_columns
}
return agate.TypeTester(force=force, types=types)
Expand All @@ -66,7 +70,13 @@ def table_from_rows(
if text_only_columns is None:
column_types = DEFAULT_TYPE_TESTER
else:
column_types = build_type_tester(text_only_columns)
# If text_only_columns are present, prevent coercing empty string or
# literal 'null' strings to a None representation.
column_types = build_type_tester(
text_only_columns,
string_null_values=()
)

return agate.Table(rows, column_names, column_types=column_types)


Expand All @@ -86,19 +96,34 @@ def table_from_data(data, column_names: Iterable[str]) -> agate.Table:


def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table:
"Convert list of dictionaries into an Agate table"
"""
Convert a list of dictionaries into an Agate table. This method does not
coerce string values into more specific types (eg. '005' will not be
coerced to '5'). Additionally, this method does not coerce values to
None (eg. '' or 'null' will retain their string literal representations).
"""

rows = []
text_only_columns = set()
for _row in data:
row = []
for value in list(_row.values()):
for col_name in column_names:
value = _row[col_name]
if isinstance(value, (dict, list, tuple)):
row.append(json.dumps(value, cls=dbt.utils.JSONEncoder))
else:
row.append(value)
# Represent container types as json strings
value = json.dumps(value, cls=dbt.utils.JSONEncoder)
text_only_columns.add(col_name)
elif isinstance(value, str):
text_only_columns.add(col_name)
row.append(value)

rows.append(row)

return table_from_rows(rows=rows, column_names=column_names)
return table_from_rows(
rows=rows,
column_names=column_names,
text_only_columns=text_only_columns
)


def empty_table():
Expand Down
29 changes: 29 additions & 0 deletions test/integration/100_rpc_test/sql/generic.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

/*
Every column returned by this query should be interpreted
as a string. If any result is coerced into a non-string (eg.
an int, float, bool, date, NoneType, etc) then dbt did the
wrong thing
*/

select
'' as str_empty_string,

'null' as str_null,

'1' as str_int,
'00005' as str_int_2,
'00' as str_int_3,

'1.1' as str_float,
'00001.1' as str_float_2,

'true' as str_bool,
'True' as str_bool_2,

'2021-01-01' as str_date,
'2021-01-01T12:00:00Z' as str_datetime,

-- this is obviously not a date... but Agate used to think it was!
-- see: https://github.com/fishtown-analytics/dbt/issues/2984
'0010T00000aabbccdd' as str_obviously_not_date
21 changes: 21 additions & 0 deletions test/integration/100_rpc_test/test_execute_fetch_and_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ def do_test_file(self, filename):
self.assertTrue(len(table.rows) > 0, "agate table had no rows")

self.do_test_pickle(table)
return table

def assert_all_columns_are_strings(self, table):
for row in table:
for value in row:
self.assertEqual(type(value), str, f'Found a not-string: {value} in row {row}')

@use_profile('bigquery')
def test__bigquery_fetch_and_serialize(self):
Expand All @@ -51,3 +57,18 @@ def test__snowflake_fetch_and_serialize(self):
@use_profile('redshift')
def test__redshift_fetch_and_serialize(self):
self.do_test_file('redshift.sql')

@use_profile('bigquery')
def test__bigquery_type_coercion(self):
table = self.do_test_file('generic.sql')
self.assert_all_columns_are_strings(table)

@use_profile('snowflake')
def test__snowflake_type_coercion(self):
table = self.do_test_file('generic.sql')
self.assert_all_columns_are_strings(table)

@use_profile('redshift')
def test__redshift_type_coercion(self):
table = self.do_test_file('generic.sql')
self.assert_all_columns_are_strings(table)
21 changes: 21 additions & 0 deletions test/unit/test_agate_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,24 @@ def test_merge_mixed(self):
assert isinstance(result.column_types[1], agate.data_types.Text)
assert isinstance(result.column_types[2], agate.data_types.Text)
self.assertEqual(len(result), 6)

def test_nocast_string_types(self):
# String fields should not be coerced into a representative type
# See: https://github.com/fishtown-analytics/dbt/issues/2984

column_names = ['a', 'b', 'c', 'd', 'e']
result_set = [
{'a': '0005', 'b': '01T00000aabbccdd', 'c': 'true', 'd': 10, 'e': False},
{'a': '0006', 'b': '01T00000aabbccde', 'c': 'false', 'd': 11, 'e': True},
]

tbl = agate_helper.table_from_data_flat(data=result_set, column_names=column_names)
self.assertEqual(len(tbl), len(result_set))

expected = [
['0005', '01T00000aabbccdd', 'true', Decimal(10), False],
['0006', '01T00000aabbccde', 'false', Decimal(11), True],
]

for i, row in enumerate(tbl):
self.assertEqual(list(row), expected[i])

0 comments on commit 83b98c8

Please sign in to comment.