diff --git a/src/sqlfluff/dialects/dialect_spark3.py b/src/sqlfluff/dialects/dialect_spark3.py index 096595e91ed..829035aaaf4 100644 --- a/src/sqlfluff/dialects/dialect_spark3.py +++ b/src/sqlfluff/dialects/dialect_spark3.py @@ -180,7 +180,8 @@ Sequence("ORDER", "BY"), Sequence("CLUSTER", "BY"), Sequence("DISTRIBUTE", "BY"), - # TODO Add PIVOT, LATERAL VIEW, SORT BY and DISTRIBUTE BY clauses + Sequence("SORT", "BY"), + # TODO Add PIVOT, LATERAL VIEW, and DISTRIBUTE BY clauses "HAVING", "WINDOW", Ref("SetOperatorSegment"), @@ -280,7 +281,7 @@ # Add Spark Grammar BucketSpecGrammar=Sequence( Ref("ClusteredBySpecGrammar"), - Ref("SortSpecGrammar", optional=True), + Ref("SortedBySpecGrammar", optional=True), "INTO", Ref("NumericLiteralSegment"), "BUCKETS", @@ -358,7 +359,7 @@ Ref("ResourceFileGrammar"), Ref("QuotedLiteralSegment"), ), - SortSpecGrammar=Sequence( + SortedBySpecGrammar=Sequence( "SORTED", "BY", Bracketed( @@ -1301,10 +1302,10 @@ class SelectStatementSegment(BaseSegment): ).parse_grammar.copy( # TODO New Rule: Warn of mutual exclusion of following clauses # DISTRIBUTE, SORT, CLUSTER and ORDER BY if multiple specified - # TODO Insert: SORT BY clauses insert=[ Ref("ClusterByClauseSegment", optional=True), Ref("DistributeByClauseSegment", optional=True), + Ref("SortByClauseSegment", optional=True), ], before=Ref("LimitClauseSegment"), ) @@ -1434,6 +1435,56 @@ class GroupingExpressionList(BaseSegment): ) +@spark3_dialect.segment() +class SortByClauseSegment(BaseSegment): + """A `SORT BY` clause like in `SELECT`. + + This clause is mutually exclusive with ORDER BY and CLUSTER BY. 
+ https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-sortby.html + """ + + type = "sort_by_clause" + + match_grammar = StartsWith( + Sequence("SORT", "BY"), + terminator=OneOf( + "LIMIT", + "HAVING", + "QUALIFY", + # For window functions + "WINDOW", + Ref("FrameClauseUnitGrammar"), + "SEPARATOR", + ), + ) + parse_grammar = Sequence( + "SORT", + "BY", + Indent, + Delimited( + Sequence( + OneOf( + Ref("ColumnReferenceSegment"), + # Can `SORT BY 1` + Ref("NumericLiteralSegment"), + # Can sort by an expression + Ref("ExpressionSegment"), + ), + OneOf("ASC", "DESC", optional=True), + # NB: This isn't really ANSI, and isn't supported in Mysql, + # but is supported in enough other dialects for it to make + # sense here for now. + Sequence("NULLS", OneOf("FIRST", "LAST"), optional=True), + ), + terminator=OneOf( + "LIMIT", + Ref("FrameClauseUnitGrammar"), + ), + ), + Dedent, + ) + + # Auxiliary Statements @spark3_dialect.segment() class AddExecutablePackage(BaseSegment): diff --git a/test/fixtures/dialects/spark3/select_sort_by.sql b/test/fixtures/dialects/spark3/select_sort_by.sql new file mode 100644 index 00000000000..bec408e20e6 --- /dev/null +++ b/test/fixtures/dialects/spark3/select_sort_by.sql @@ -0,0 +1,112 @@ +-- Sort rows within each partition in ascending manner +SELECT /*+ REPARTITION(zip_code) */ + name, + age, + zip_code +FROM person +SORT BY + name; + +SELECT + name, + age, + zip_code +FROM person +SORT BY + name; + +-- Sort rows within each partition using column position. +SELECT /*+ REPARTITION(zip_code) */ + name, + age, + zip_code +FROM person +SORT BY 1; + +SELECT + name, + age, + zip_code +FROM person +SORT BY 1; + +-- Sort rows within partition in ascending +-- manner keeping null values to be last. 
+SELECT /*+ REPARTITION(zip_code) */ + age, + name, + zip_code +FROM person +SORT BY + age NULLS LAST; + +SELECT + age, + name, + zip_code +FROM person +SORT BY + age NULLS LAST; + +-- Sort rows by age within each partition in +-- descending manner, which defaults to NULL LAST. +SELECT /*+ REPARTITION(zip_code) */ + age, + name, + zip_code +FROM person +SORT BY + age DESC; + +SELECT + age, + name, + zip_code +FROM person +SORT BY + age DESC; + +-- Sort rows by age within each partition in +-- descending manner keeping null values to be first. +SELECT /*+ REPARTITION(zip_code) */ + age, + name, + zip_code +FROM person +SORT BY + age DESC NULLS FIRST; + +SELECT + age, + name, + zip_code +FROM person +SORT BY + age DESC NULLS FIRST; + +-- Sort rows within each partition based on more +-- than one column with each column having different +-- sort direction. +SELECT /*+ REPARTITION(zip_code) */ + name, + age, + zip_code +FROM person +SORT BY + name ASC, age DESC; + +SELECT + name, + age, + zip_code +FROM person +SORT BY + name ASC, age DESC; + +-- Sort rows within each partition based on result of a function. +SELECT + age, + name +FROM person +SORT BY + LEFT(SUBSTRING_INDEX(name, ' ', -1), 1); diff --git a/test/fixtures/dialects/spark3/select_sort_by.yml b/test/fixtures/dialects/spark3/select_sort_by.yml new file mode 100644 index 00000000000..2c5c05f8842 --- /dev/null +++ b/test/fixtures/dialects/spark3/select_sort_by.yml @@ -0,0 +1,479 @@ +# YML test files are auto-generated from SQL files and should not be edited by +# hand. To help enforce this, the "hash" field in the file must match a hash +# computed by SQLFluff when running the tests. Please run +# `python test/generate_parse_fixture_yml.py` to generate them after adding or +# altering SQL files. 
+_hash: 013e20883076666f60cecb20b65c41da3b0cfbecc717df808d6f87b88c727202 +file: +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_modifier: + select_hint: + - keyword: /*+ + - hint_function: + function_name: + function_name_identifier: REPARTITION + bracketed: + start_bracket: ( + identifier: zip_code + end_bracket: ) + - keyword: '*/' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: name +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: name +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_modifier: + select_hint: + - keyword: /*+ + - hint_function: + function_name: + function_name_identifier: REPARTITION + bracketed: + start_bracket: ( + identifier: zip_code + end_bracket: ) + - keyword: '*/' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + 
from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - literal: '1' +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - literal: '1' +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_modifier: + select_hint: + - keyword: /*+ + - hint_function: + function_name: + function_name_identifier: REPARTITION + bracketed: + start_bracket: ( + identifier: zip_code + end_bracket: ) + - keyword: '*/' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: age + - keyword: NULLS + - keyword: LAST +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + 
- keyword: BY + - column_reference: + identifier: age + - keyword: NULLS + - keyword: LAST +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_modifier: + select_hint: + - keyword: /*+ + - hint_function: + function_name: + function_name_identifier: REPARTITION + bracketed: + start_bracket: ( + identifier: zip_code + end_bracket: ) + - keyword: '*/' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: age + - keyword: DESC +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: age + - keyword: DESC +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_modifier: + select_hint: + - keyword: /*+ + - hint_function: + function_name: + function_name_identifier: REPARTITION + bracketed: + start_bracket: ( + identifier: zip_code + end_bracket: ) + - keyword: '*/' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: 
zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: age + - keyword: DESC + - keyword: NULLS + - keyword: FIRST +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: age + - keyword: DESC + - keyword: NULLS + - keyword: FIRST +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_modifier: + select_hint: + - keyword: /*+ + - hint_function: + function_name: + function_name_identifier: REPARTITION + bracketed: + start_bracket: ( + identifier: zip_code + end_bracket: ) + - keyword: '*/' + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: name + - keyword: ASC + - comma: ',' + - column_reference: + identifier: age + - keyword: DESC +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: name + - comma: ',' + - select_clause_element: + column_reference: + 
identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: zip_code + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - column_reference: + identifier: name + - keyword: ASC + - comma: ',' + - column_reference: + identifier: age + - keyword: DESC +- statement_terminator: ; +- base: + select_statement: + select_clause: + - keyword: SELECT + - select_clause_element: + column_reference: + identifier: age + - comma: ',' + - select_clause_element: + column_reference: + identifier: name + from_clause: + keyword: FROM + from_expression: + from_expression_element: + table_expression: + table_reference: + identifier: person + sort_by_clause: + - keyword: SORT + - keyword: BY + - expression: + function: + function_name: + function_name_identifier: LEFT + bracketed: + - start_bracket: ( + - expression: + function: + function_name: + function_name_identifier: SUBSTRING_INDEX + bracketed: + - start_bracket: ( + - expression: + column_reference: + identifier: name + - comma: ',' + - expression: + literal: "' '" + - comma: ',' + - expression: + numeric_literal: + binary_operator: '-' + literal: '1' + - end_bracket: ) + - comma: ',' + - expression: + literal: '1' + - end_bracket: ) +- statement_terminator: ;