-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prevent Agate from coercing values in query result sets #3499
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,11 @@ def cast(self, d): | |
) | ||
|
||
|
||
def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester: | ||
def build_type_tester( | ||
text_columns: Iterable[str], | ||
string_null_values: Optional[Iterable[str]] = ('null', '') | ||
) -> agate.TypeTester: | ||
|
||
types = [ | ||
agate.data_types.Number(null_values=('null', '')), | ||
agate.data_types.Date(null_values=('null', ''), | ||
|
@@ -46,10 +50,10 @@ def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester: | |
agate.data_types.Boolean(true_values=('true',), | ||
false_values=('false',), | ||
null_values=('null', '')), | ||
agate.data_types.Text(null_values=('null', '')) | ||
agate.data_types.Text(null_values=string_null_values) | ||
] | ||
force = { | ||
k: agate.data_types.Text(null_values=('null', '')) | ||
k: agate.data_types.Text(null_values=string_null_values) | ||
for k in text_columns | ||
} | ||
return agate.TypeTester(force=force, types=types) | ||
|
@@ -66,7 +70,13 @@ def table_from_rows( | |
if text_only_columns is None: | ||
column_types = DEFAULT_TYPE_TESTER | ||
else: | ||
column_types = build_type_tester(text_only_columns) | ||
# If text_only_columns are present, prevent coercing empty string or | ||
# literal 'null' strings to a None representation. | ||
column_types = build_type_tester( | ||
text_only_columns, | ||
string_null_values=() | ||
) | ||
|
||
return agate.Table(rows, column_names, column_types=column_types) | ||
|
||
|
||
|
@@ -86,19 +96,34 @@ def table_from_data(data, column_names: Iterable[str]) -> agate.Table: | |
|
||
|
||
def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table: | ||
"Convert list of dictionaries into an Agate table" | ||
""" | ||
Convert a list of dictionaries into an Agate table. This method does not | ||
coerce string values into more specific types (eg. '005' will not be | ||
coerced to '5'). Additionally, this method does not coerce values to | ||
None (eg. '' or 'null' will retain their string literal representations). | ||
""" | ||
|
||
rows = [] | ||
text_only_columns = set() | ||
for _row in data: | ||
row = [] | ||
for value in list(_row.values()): | ||
for col_name in column_names: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think our existing implementation assumed that the order of keys in a dictionary was guaranteed... but IIRC that's not true for all versions of Python. My Python is rusty, so maybe this change is not necessary. I can revert if need be :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not my area, but I do remember hearing that this changed in recent versions of python. Indeed this blog post suggests that dictionaries are ordered by insertion as of python 3.6, and that this is officially guaranteed as of python 3.7. dbt currently requires py36 or higher. |
||
value = _row[col_name] | ||
if isinstance(value, (dict, list, tuple)): | ||
row.append(json.dumps(value, cls=dbt.utils.JSONEncoder)) | ||
else: | ||
row.append(value) | ||
# Represent container types as json strings | ||
value = json.dumps(value, cls=dbt.utils.JSONEncoder) | ||
text_only_columns.add(col_name) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was originally just going to peek at the first row in the result set, but it's possible that the value in a string column returned by the database could be |
||
elif isinstance(value, str): | ||
text_only_columns.add(col_name) | ||
row.append(value) | ||
|
||
rows.append(row) | ||
|
||
return table_from_rows(rows=rows, column_names=column_names) | ||
return table_from_rows( | ||
rows=rows, | ||
column_names=column_names, | ||
text_only_columns=text_only_columns | ||
) | ||
|
||
|
||
def empty_table(): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
/* | ||
Every column returned by this query should be interpreted | ||
as a string. If any result is coerced into a non-string (eg. | ||
an int, float, bool, date, NoneType, etc) then dbt did the | ||
wrong thing | ||
*/ | ||
|
||
select | ||
'' as str_empty_string, | ||
|
||
'null' as str_null, | ||
|
||
'1' as str_int, | ||
'00005' as str_int_2, | ||
'00' as str_int_3, | ||
|
||
'1.1' as str_float, | ||
'00001.1' as str_float_2, | ||
|
||
'true' as str_bool, | ||
'True' as str_bool_2, | ||
|
||
'2021-01-01' as str_date, | ||
'2021-01-01T12:00:00Z' as str_datetime, | ||
|
||
-- this is obviously not a date... but Agate used to think it was! | ||
-- see: https://github.com/fishtown-analytics/dbt/issues/2984 | ||
'0010T00000aabbccdd' as str_obviously_not_date |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good!
Before:
After: