fix(spark): Custom annotation for SUBSTRING() (#4004)

* fix(spark): Custom annotation for SUBSTRING()
* Remove empty line
* PR Feedback 1
tobymao · Aug 29, 2024 · 4b7ca2b
1 parent fcaae87
commit 4b7ca2b
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 0 deletions.
diff --git a/sqlglot/dialects/spark2.py b/sqlglot/dialects/spark2.py
@@ -111,6 +111,11 @@ def temporary_storage_provider(expression: exp.Expression) -> exp.Expression:
 
 
 class Spark2(Hive):
+    ANNOTATORS = {
+        **Hive.ANNOTATORS,
+        exp.Substring: lambda self, e: self._annotate_by_args(e, "this"),
+    }
+
     class Parser(Hive.Parser):
         TRIM_PATTERN_FIRST = True
 

diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
@@ -1345,3 +1345,26 @@ def gen_expr(depth: int) -> exp.Expression:
         self.assertEqual(4, normalization_distance(gen_expr(2), max_=100))
         self.assertEqual(18, normalization_distance(gen_expr(3), max_=100))
         self.assertEqual(110, normalization_distance(gen_expr(10), max_=100))
+
+    def test_custom_annotators(self):
+        """Spark-family SUBSTRING() must be annotated with the type of its input expression."""
+        for dialect in ("spark2", "spark", "databricks"):
+            for expr_type_pair in (
+                ("col", "STRING"),
+                ("col", "BINARY"),
+                ("'str_literal'", "STRING"),
+                ("CAST('str_literal' AS BINARY)", "BINARY"),
+            ):
+                with self.subTest(
+                    f"Testing {dialect}'s SUBSTRING() result type for {expr_type_pair}"
+                ):
+                    expr, expected_type = expr_type_pair  # not `type`: that shadows the builtin
+                    ast = parse_one(f"SELECT substring({expr}, 2, 3) AS x FROM tbl", read=dialect)
+
+                    # Declare `col` with the expected type; literal cases do not reference it.
+                    schema = {"tbl": {"col": expected_type}}
+                    optimized = optimizer.optimize(ast, schema=schema, dialect=dialect)
+                    actual_sql = optimized.expressions[0].type.sql(dialect)
+                    expected_sql = exp.DataType.build(expected_type).sql(dialect)
+
+                    self.assertEqual(actual_sql, expected_sql)