From 1ad1c834f3dca6523ca68347b90f70dd34d1daa1 Mon Sep 17 00:00:00 2001
From: Drew Banin <drew@fishtownanalytics.com>
Date: Sat, 26 Jun 2021 12:39:16 -0400
Subject: [PATCH] (#2984) Prevent Agate from coercing values in query result
 sets

---
 CHANGELOG.md                                  |  1 +
 core/dbt/clients/agate_helper.py              | 45 ++++++++++++++-----
 test/integration/100_rpc_test/sql/generic.sql | 29 ++++++++++++
 .../test_execute_fetch_and_serialize.py       | 21 +++++++++
 test/unit/test_agate_helper.py                | 21 +++++++++
 5 files changed, 107 insertions(+), 10 deletions(-)
 create mode 100644 test/integration/100_rpc_test/sql/generic.sql

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f9206d2545e..c4cc7bd162e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ Contributors:
 ### Fixes
 
 - Handle quoted values within test configs, such as `where` ([#3458](https://github.com/fishtown-analytics/dbt/issues/3458), [#3459](https://github.com/fishtown-analytics/dbt/pull/3459))
+- Fix type coercion issues when fetching query result sets ([#2984](https://github.com/fishtown-analytics/dbt/issues/2984), [#3499](https://github.com/fishtown-analytics/dbt/pull/3499))
 
 ### Under the hood
 
diff --git a/core/dbt/clients/agate_helper.py b/core/dbt/clients/agate_helper.py
index 29f285edac3..d2f9298df48 100644
--- a/core/dbt/clients/agate_helper.py
+++ b/core/dbt/clients/agate_helper.py
@@ -35,7 +35,11 @@ def cast(self, d):
         )
 
 
-def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
+def build_type_tester(
+    text_columns: Iterable[str],
+    string_null_values: Optional[Iterable[str]] = ('null', '')
+) -> agate.TypeTester:
+
     types = [
         agate.data_types.Number(null_values=('null', '')),
         agate.data_types.Date(null_values=('null', ''),
@@ -46,10 +50,10 @@ def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
         agate.data_types.Boolean(true_values=('true',),
                                  false_values=('false',),
                                  null_values=('null', '')),
-        agate.data_types.Text(null_values=('null', ''))
+        agate.data_types.Text(null_values=string_null_values)
     ]
     force = {
-        k: agate.data_types.Text(null_values=('null', ''))
+        k: agate.data_types.Text(null_values=string_null_values)
         for k in text_columns
     }
     return agate.TypeTester(force=force, types=types)
@@ -66,7 +70,13 @@ def table_from_rows(
     if text_only_columns is None:
         column_types = DEFAULT_TYPE_TESTER
     else:
-        column_types = build_type_tester(text_only_columns)
+        # If text_only_columns are present, prevent coercing empty string or
+        # literal 'null' strings to a None representation.
+        column_types = build_type_tester(
+            text_only_columns,
+            string_null_values=()
+        )
+
     return agate.Table(rows, column_names, column_types=column_types)
 
 
@@ -86,19 +96,34 @@ def table_from_data(data, column_names: Iterable[str]) -> agate.Table:
 
 
 def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table:
-    "Convert list of dictionaries into an Agate table"
+    """
+    Convert a list of dictionaries into an Agate table. This method does not
+    coerce string values into more specific types (eg. '005' will not be
+    coerced to '5'). Additionally, this method does not coerce values to
+    None (eg. '' or 'null' will retain their string literal representations).
+    """
 
     rows = []
+    text_only_columns = set()
     for _row in data:
         row = []
-        for value in list(_row.values()):
+        for col_name in column_names:
+            value = _row[col_name]
             if isinstance(value, (dict, list, tuple)):
-                row.append(json.dumps(value, cls=dbt.utils.JSONEncoder))
-            else:
-                row.append(value)
+                # Represent container types as json strings
+                value = json.dumps(value, cls=dbt.utils.JSONEncoder)
+                text_only_columns.add(col_name)
+            elif isinstance(value, str):
+                text_only_columns.add(col_name)
+            row.append(value)
+
         rows.append(row)
 
-    return table_from_rows(rows=rows, column_names=column_names)
+    return table_from_rows(
+        rows=rows,
+        column_names=column_names,
+        text_only_columns=text_only_columns
+    )
 
 
 def empty_table():
diff --git a/test/integration/100_rpc_test/sql/generic.sql b/test/integration/100_rpc_test/sql/generic.sql
new file mode 100644
index 00000000000..5e00cdc46f8
--- /dev/null
+++ b/test/integration/100_rpc_test/sql/generic.sql
@@ -0,0 +1,29 @@
+
+/*
+    Every column returned by this query should be interpreted
+    as a string. If any result is coerced into a non-string (eg.
+    an int, float, bool, date, NoneType, etc) then dbt did the
+    wrong thing
+*/
+
+select
+    '' as str_empty_string,
+
+    'null' as str_null,
+
+    '1' as str_int,
+    '00005' as str_int_2,
+    '00' as str_int_3,
+
+    '1.1' as str_float,
+    '00001.1' as str_float_2,
+
+    'true' as str_bool,
+    'True' as str_bool_2,
+
+    '2021-01-01' as str_date,
+    '2021-01-01T12:00:00Z' as str_datetime,
+
+    -- this is obviously not a date... but Agate used to think it was!
+    -- see: https://github.com/fishtown-analytics/dbt/issues/2984
+    '0010T00000aabbccdd' as str_obviously_not_date
diff --git a/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py b/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py
index d5fa2ba76fe..6c4a73eaf74 100644
--- a/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py
+++ b/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py
@@ -39,6 +39,12 @@ def do_test_file(self, filename):
         self.assertTrue(len(table.rows) > 0, "agate table had no rows")
 
         self.do_test_pickle(table)
+        return table
+
+    def assert_all_columns_are_strings(self, table):
+        for row in table:
+            for value in row:
+                self.assertEqual(type(value), str, f'Found a not-string: {value} in row {row}')
 
     @use_profile('bigquery')
     def test__bigquery_fetch_and_serialize(self):
@@ -51,3 +57,18 @@ def test__snowflake_fetch_and_serialize(self):
     @use_profile('redshift')
     def test__redshift_fetch_and_serialize(self):
         self.do_test_file('redshift.sql')
+
+    @use_profile('bigquery')
+    def test__bigquery_type_coercion(self):
+        table = self.do_test_file('generic.sql')
+        self.assert_all_columns_are_strings(table)
+
+    @use_profile('snowflake')
+    def test__snowflake_type_coercion(self):
+        table = self.do_test_file('generic.sql')
+        self.assert_all_columns_are_strings(table)
+
+    @use_profile('redshift')
+    def test__redshift_type_coercion(self):
+        table = self.do_test_file('generic.sql')
+        self.assert_all_columns_are_strings(table)
diff --git a/test/unit/test_agate_helper.py b/test/unit/test_agate_helper.py
index 8f15ff75984..6add176c601 100644
--- a/test/unit/test_agate_helper.py
+++ b/test/unit/test_agate_helper.py
@@ -133,3 +133,24 @@ def test_merge_mixed(self):
         assert isinstance(result.column_types[1], agate.data_types.Text)
         assert isinstance(result.column_types[2], agate.data_types.Text)
         self.assertEqual(len(result), 6)
+
+    def test_nocast_string_types(self):
+        # String fields should not be coerced into a representative type
+        # See: https://github.com/fishtown-analytics/dbt/issues/2984
+
+        column_names = ['a', 'b', 'c', 'd', 'e']
+        result_set = [
+            {'a': '0005', 'b': '01T00000aabbccdd', 'c': 'true', 'd': 10, 'e': False},
+            {'a': '0006', 'b': '01T00000aabbccde', 'c': 'false', 'd': 11, 'e': True},
+        ]
+
+        tbl = agate_helper.table_from_data_flat(data=result_set, column_names=column_names)
+        self.assertEqual(len(tbl), len(result_set))
+
+        expected = [
+            ['0005', '01T00000aabbccdd', 'true', Decimal(10), False],
+            ['0006', '01T00000aabbccde', 'false', Decimal(11), True],
+        ]
+
+        for i, row in enumerate(tbl):
+            self.assertEqual(list(row), expected[i])