SQL: Allow sorting of groups by aggregates (#38042) (#38255)

Introduce client-side sorting of groups based on aggregate functions. To allow this, the Analyzer has been extended to push aggregate functions used for ordering down to the underlying Aggregate, and the Querier has been extended to identify this case, consume the results in order, and sort them based on the given columns. The underlying QueryContainer has been slightly modified to allow a view of the underlying values being extracted, as the columns used for sorting might not be requested by the user. The PR also adds minor tweaks, mainly related to tree output. Closes #35118 (cherry picked from commit 783c9ed)
elastic · Feb 2, 2019 · db24e5c · db24e5c
1 parent 1c845d6
commit db24e5c
Show file tree

Hide file tree

Showing 60 changed files with 1,343 additions and 401 deletions.
diff --git a/docs/reference/sql/limitations.asciidoc b/docs/reference/sql/limitations.asciidoc
@@ -67,8 +67,18 @@ a field is an array (has multiple values) or not, so without reading all the dat
 === Sorting by aggregation
 
 When doing aggregations (`GROUP BY`) {es-sql} relies on {es}'s `composite` aggregation for its support for paginating results.
-But this type of aggregation does come with a limitation: sorting can only be applied on the key used for the aggregation's buckets. This
-means that queries like `SELECT * FROM test GROUP BY age ORDER BY COUNT(*)` are not possible.
+However, this type of aggregation does come with a limitation: sorting can only be applied on the key used for the aggregation's buckets.
+{es-sql} overcomes this limitation by doing client-side sorting; however, as a safety measure, it allows only up to *512* rows.
+
+It is recommended to use `LIMIT` for queries that use sorting by aggregation, essentially indicating the top N results that are desired:
+
+[source, sql]
+--------------------------------------------------
+SELECT * FROM test GROUP BY age ORDER BY COUNT(*) LIMIT 100;
+--------------------------------------------------
+
+It is possible to run the same queries without a `LIMIT`; however, in that case, if the maximum size (*512*) is exceeded, an exception will be
+returned as {es-sql} is unable to track (and sort) all the results returned.
 
 [float]
 === Using aggregation functions on top of scalar functions

diff --git a/...qa/single-node/src/test/java/org/elasticsearch/xpack/sql/qa/single_node/CliExplainIT.java b/...qa/single-node/src/test/java/org/elasticsearch/xpack/sql/qa/single_node/CliExplainIT.java
@@ -20,7 +20,7 @@ public void testExplainBasic() throws IOException {
         assertThat(readLine(), startsWith("----------"));
         assertThat(readLine(), startsWith("With[{}]"));
         assertThat(readLine(), startsWith("\\_Project[[?*]]"));
-        assertThat(readLine(), startsWith("  \\_UnresolvedRelation[[][index=test],null,Unknown index [test]]"));
+        assertThat(readLine(), startsWith("  \\_UnresolvedRelation[test]"));
         assertEquals("", readLine());
 
         assertThat(command("EXPLAIN " + (randomBoolean() ? "" : "(PLAN ANALYZED) ") + "SELECT * FROM test"), containsString("plan"));
@@ -64,22 +64,22 @@ public void testExplainWithWhere() throws IOException {
         assertThat(readLine(), startsWith("----------"));
         assertThat(readLine(), startsWith("With[{}]"));
         assertThat(readLine(), startsWith("\\_Project[[?*]]"));
-        assertThat(readLine(), startsWith("  \\_Filter[i = 2#"));
-        assertThat(readLine(), startsWith("    \\_UnresolvedRelation[[][index=test],null,Unknown index [test]]"));
+        assertThat(readLine(), startsWith("  \\_Filter[Equals[?i,2"));
+        assertThat(readLine(), startsWith("    \\_UnresolvedRelation[test]"));
         assertEquals("", readLine());
 
         assertThat(command("EXPLAIN " + (randomBoolean() ? "" : "(PLAN ANALYZED) ") + "SELECT * FROM test WHERE i = 2"),
                 containsString("plan"));
         assertThat(readLine(), startsWith("----------"));
         assertThat(readLine(), startsWith("Project[[i{f}#"));
-        assertThat(readLine(), startsWith("\\_Filter[i = 2#"));
+        assertThat(readLine(), startsWith("\\_Filter[Equals[i"));
         assertThat(readLine(), startsWith("  \\_EsRelation[test][i{f}#"));
         assertEquals("", readLine());
 
         assertThat(command("EXPLAIN (PLAN OPTIMIZED) SELECT * FROM test WHERE i = 2"), containsString("plan"));
         assertThat(readLine(), startsWith("----------"));
         assertThat(readLine(), startsWith("Project[[i{f}#"));
-        assertThat(readLine(), startsWith("\\_Filter[i = 2#"));
+        assertThat(readLine(), startsWith("\\_Filter[Equals[i"));
         assertThat(readLine(), startsWith("  \\_EsRelation[test][i{f}#"));
         assertEquals("", readLine());
 
@@ -124,20 +124,20 @@ public void testExplainWithCount() throws IOException {
         assertThat(command("EXPLAIN (PLAN PARSED) SELECT COUNT(*) FROM test"), containsString("plan"));
         assertThat(readLine(), startsWith("----------"));
         assertThat(readLine(), startsWith("With[{}]"));
-        assertThat(readLine(), startsWith("\\_Project[[?COUNT(*)]]"));
-        assertThat(readLine(), startsWith("  \\_UnresolvedRelation[[][index=test],null,Unknown index [test]]"));
+        assertThat(readLine(), startsWith("\\_Project[[?COUNT[?*]]]"));
+        assertThat(readLine(), startsWith("  \\_UnresolvedRelation[test]"));
         assertEquals("", readLine());
 
         assertThat(command("EXPLAIN " + (randomBoolean() ? "" : "(PLAN ANALYZED) ") + "SELECT COUNT(*) FROM test"),
                 containsString("plan"));
         assertThat(readLine(), startsWith("----------"));
-        assertThat(readLine(), startsWith("Aggregate[[],[COUNT(*)#"));
+        assertThat(readLine(), startsWith("Aggregate[[],[Count[*=1"));
         assertThat(readLine(), startsWith("\\_EsRelation[test][i{f}#"));
         assertEquals("", readLine());
 
         assertThat(command("EXPLAIN (PLAN OPTIMIZED) SELECT COUNT(*) FROM test"), containsString("plan"));
         assertThat(readLine(), startsWith("----------"));
-        assertThat(readLine(), startsWith("Aggregate[[],[COUNT(*)#"));
+        assertThat(readLine(), startsWith("Aggregate[[],[Count[*=1"));
         assertThat(readLine(), startsWith("\\_EsRelation[test][i{f}#"));
         assertEquals("", readLine());
 

diff --git a/x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/cli/ErrorsTestCase.java b/x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/cli/ErrorsTestCase.java
@@ -73,7 +73,7 @@ public void testSelectProjectScoreInAggContext() throws Exception {
     public void testSelectOrderByScoreInAggContext() throws Exception {
         index("test", body -> body.field("foo", 1));
         assertFoundOneProblem(command("SELECT foo, COUNT(*) FROM test GROUP BY foo ORDER BY SCORE()"));
-        assertEquals("line 1:54: Cannot order by non-grouped column [SCORE()], expected [foo]" + END, readLine());
+        assertEquals("line 1:54: Cannot order by non-grouped column [SCORE()], expected [foo] or an aggregate function" + END, readLine());
     }
 
     @Override

diff --git a/x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/ErrorsTestCase.java b/x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/ErrorsTestCase.java
@@ -81,7 +81,9 @@ public void testSelectOrderByScoreInAggContext() throws Exception {
         try (Connection c = esJdbc()) {
             SQLException e = expectThrows(SQLException.class, () ->
                 c.prepareStatement("SELECT foo, COUNT(*) FROM test GROUP BY foo ORDER BY SCORE()").executeQuery());
-            assertEquals("Found 1 problem(s)\nline 1:54: Cannot order by non-grouped column [SCORE()], expected [foo]", e.getMessage());
+            assertEquals(
+                    "Found 1 problem(s)\nline 1:54: Cannot order by non-grouped column [SCORE()], expected [foo] or an aggregate function",
+                    e.getMessage());
         }
     }
 

diff --git a/x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SqlSpecTestCase.java b/x-pack/plugin/sql/qa/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SqlSpecTestCase.java
@@ -38,6 +38,7 @@ public static List<Object[]> readScriptSpec() throws Exception {
         tests.addAll(readScriptSpec("/datetime.sql-spec", parser));
         tests.addAll(readScriptSpec("/math.sql-spec", parser));
         tests.addAll(readScriptSpec("/agg.sql-spec", parser));
+        tests.addAll(readScriptSpec("/agg-ordering.sql-spec", parser));
         tests.addAll(readScriptSpec("/arithmetic.sql-spec", parser));
         tests.addAll(readScriptSpec("/string-functions.sql-spec", parser));
         tests.addAll(readScriptSpec("/case-functions.sql-spec", parser));

diff --git a/x-pack/plugin/sql/qa/src/main/resources/agg-ordering.sql-spec b/x-pack/plugin/sql/qa/src/main/resources/agg-ordering.sql-spec
@@ -0,0 +1,87 @@
+//
+// Custom sorting/ordering on aggregates
+//
+
+countWithImplicitGroupBy
+SELECT MAX(salary) AS m FROM test_emp ORDER BY COUNT(*);
+
+countWithImplicitGroupByWithHaving
+SELECT MAX(salary) AS m FROM test_emp HAVING MIN(salary) > 1 ORDER BY COUNT(*);
+
+countAndMaxWithImplicitGroupBy
+SELECT MAX(salary) AS m FROM test_emp ORDER BY MAX(salary), COUNT(*);
+
+maxWithAliasWithImplicitGroupBy
+SELECT MAX(salary) AS m FROM test_emp ORDER BY m;
+
+maxWithAliasWithImplicitGroupByAndHaving
+SELECT MAX(salary) AS m FROM test_emp HAVING COUNT(*) > 1 ORDER BY m;
+
+multipleOrderWithImplicitGroupByWithHaving
+SELECT MAX(salary) AS m FROM test_emp HAVING MIN(salary) > 1 ORDER BY COUNT(*), m DESC;
+
+multipleOrderWithImplicitGroupByWithoutAlias
+SELECT MAX(salary) AS m FROM test_emp HAVING MIN(salary) > 1 ORDER BY COUNT(*), MIN(salary) DESC;
+
+multipleOrderWithImplicitGroupByOfOrdinals
+SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp HAVING MIN(salary) > 1 ORDER BY 1, COUNT(*), 2 DESC;
+
+aggWithoutAlias
+SELECT MAX(salary) AS max FROM test_emp GROUP BY gender ORDER BY MAX(salary);
+
+aggWithAlias
+SELECT MAX(salary) AS m FROM test_emp GROUP BY gender ORDER BY m;
+
+multipleAggsThatGetRewrittenWithoutAlias
+SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY gender ORDER BY MAX(salary);
+
+multipleAggsThatGetRewrittenWithAliasDesc
+SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY gender ORDER BY 1 DESC;
+
+multipleAggsThatGetRewrittenWithAlias
+SELECT MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY gender ORDER BY max;
+
+aggNotSpecifiedInTheAggregate
+SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender ORDER BY MAX(salary);
+
+aggNotSpecifiedInTheAggregatePlusOrdinal
+SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender ORDER BY MAX(salary), 2 DESC;
+
+aggNotSpecifiedInTheAggregateWithHaving
+SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY MAX(salary);
+
+aggNotSpecifiedInTheAggregateWithHavingDesc
+SELECT MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY MAX(salary) DESC;
+
+aggNotSpecifiedInTheAggregateAndGroupWithHaving
+SELECT gender, MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY MAX(salary), gender;
+
+groupAndAggNotSpecifiedInTheAggregateWithHaving
+SELECT gender, MIN(salary) AS min, COUNT(*) AS c FROM test_emp GROUP BY gender HAVING c > 1 ORDER BY gender, MAX(salary);
+
+multipleAggsThatGetRewrittenWithAliasOnAMediumGroupBy
+SELECT languages, MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY languages ORDER BY max;
+
+multipleAggsThatGetRewrittenWithAliasOnALargeGroupBy
+SELECT emp_no, MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY emp_no ORDER BY max;
+
+multipleAggsThatGetRewrittenWithAliasOnAMediumGroupByWithHaving
+SELECT languages, MAX(salary) AS max, MIN(salary) AS min FROM test_emp GROUP BY languages HAVING min BETWEEN 1000 AND 99999 ORDER BY max;
+
+aggNotSpecifiedInTheAggregatemultipleAggsThatGetRewrittenWithAliasOnALargeGroupBy
+SELECT emp_no, MIN(salary) AS min FROM test_emp GROUP BY emp_no ORDER BY MAX(salary);
+
+aggNotSpecifiedWithHavingOnLargeGroupBy
+SELECT MAX(salary) AS max FROM test_emp GROUP BY emp_no HAVING AVG(salary) > 1000 ORDER BY MIN(salary);
+
+aggWithTieBreakerDescAsc
+SELECT emp_no, MIN(languages) AS min FROM test_emp GROUP BY emp_no ORDER BY MIN(languages) DESC NULLS FIRST, emp_no ASC;
+
+aggWithTieBreakerDescDesc
+SELECT emp_no, MIN(languages) AS min FROM test_emp GROUP BY emp_no ORDER BY MIN(languages) DESC NULLS FIRST, emp_no DESC;
+
+aggWithTieBreakerAscDesc
+SELECT emp_no, MIN(languages) AS min FROM test_emp GROUP BY emp_no ORDER BY MAX(languages) ASC NULLS FIRST, emp_no DESC;
+
+aggWithMixOfOrdinals
+SELECT gender AS g, MAX(salary) AS m FROM test_emp GROUP BY gender ORDER BY 2 DESC LIMIT 3;
diff --git a/x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/analysis/analyzer/Analyzer.java b/x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/analysis/analyzer/Analyzer.java
@@ -52,6 +52,8 @@
 import org.elasticsearch.xpack.sql.type.DataTypes;
 import org.elasticsearch.xpack.sql.type.InvalidMappedField;
 import org.elasticsearch.xpack.sql.type.UnsupportedEsField;
+import org.elasticsearch.xpack.sql.util.CollectionUtils;
+import org.elasticsearch.xpack.sql.util.Holder;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -106,7 +108,8 @@ protected Iterable<RuleExecutor<LogicalPlan>.Batch> batches() {
                 new ResolveFunctions(),
                 new ResolveAliases(),
                 new ProjectedAggregations(),
-                new ResolveAggsInHaving()
+                new ResolveAggsInHaving(),
+                new ResolveAggsInOrderBy()
                 //new ImplicitCasting()
                 );
         Batch finish = new Batch("Finish Analysis",
@@ -926,62 +929,57 @@ protected LogicalPlan rule(Project p) {
     // Handle aggs in HAVING. To help folding any aggs not found in Aggregation
     // will be pushed down to the Aggregate and then projected. This also simplifies the Verifier's job.
     //
-    private class ResolveAggsInHaving extends AnalyzeRule<LogicalPlan> {
+    private class ResolveAggsInHaving extends AnalyzeRule<Filter> {
 
         @Override
         protected boolean skipResolved() {
             return false;
         }
 
         @Override
-        protected LogicalPlan rule(LogicalPlan plan) {
+        protected LogicalPlan rule(Filter f) {
             // HAVING = Filter followed by an Agg
-            if (plan instanceof Filter) {
-                Filter f = (Filter) plan;
-                if (f.child() instanceof Aggregate && f.child().resolved()) {
-                    Aggregate agg = (Aggregate) f.child();
+            if (f.child() instanceof Aggregate && f.child().resolved()) {
+                Aggregate agg = (Aggregate) f.child();
 
-                    Set<NamedExpression> missing = null;
-                    Expression condition = f.condition();
+                Set<NamedExpression> missing = null;
+                Expression condition = f.condition();
 
-                    // the condition might contain an agg (AVG(salary)) that could have been resolved
-                    // (salary cannot be pushed down to Aggregate since there's no grouping and thus the function wasn't resolved either)
+                // the condition might contain an agg (AVG(salary)) that could have been resolved
+                // (salary cannot be pushed down to Aggregate since there's no grouping and thus the function wasn't resolved either)
 
-                    // so try resolving the condition in one go through a 'dummy' aggregate
-                    if (!condition.resolved()) {
-                        // that's why try to resolve the condition
-                        Aggregate tryResolvingCondition = new Aggregate(agg.source(), agg.child(), agg.groupings(),
-                                combine(agg.aggregates(), new Alias(f.source(), ".having", condition)));
+                // so try resolving the condition in one go through a 'dummy' aggregate
+                if (!condition.resolved()) {
+                    // that's why try to resolve the condition
+                    Aggregate tryResolvingCondition = new Aggregate(agg.source(), agg.child(), agg.groupings(),
+                            combine(agg.aggregates(), new Alias(f.source(), ".having", condition)));
 
-                        tryResolvingCondition = (Aggregate) analyze(tryResolvingCondition, false);
+                    tryResolvingCondition = (Aggregate) analyze(tryResolvingCondition, false);
 
-                        // if it got resolved
-                        if (tryResolvingCondition.resolved()) {
-                            // replace the condition with the resolved one
-                            condition = ((Alias) tryResolvingCondition.aggregates()
-                                .get(tryResolvingCondition.aggregates().size() - 1)).child();
-                        } else {
-                            // else bail out
-                            return plan;
-                        }
+                    // if it got resolved
+                    if (tryResolvingCondition.resolved()) {
+                        // replace the condition with the resolved one
+                        condition = ((Alias) tryResolvingCondition.aggregates()
+                            .get(tryResolvingCondition.aggregates().size() - 1)).child();
+                    } else {
+                        // else bail out
+                        return f;
                     }
+                }
 
-                    missing = findMissingAggregate(agg, condition);
-
-                    if (!missing.isEmpty()) {
-                        Aggregate newAgg = new Aggregate(agg.source(), agg.child(), agg.groupings(),
-                                combine(agg.aggregates(), missing));
-                        Filter newFilter = new Filter(f.source(), newAgg, condition);
-                        // preserve old output
-                        return new Project(f.source(), newFilter, f.output());
-                    }
+                missing = findMissingAggregate(agg, condition);
 
-                    return new Filter(f.source(), f.child(), condition);
+                if (!missing.isEmpty()) {
+                    Aggregate newAgg = new Aggregate(agg.source(), agg.child(), agg.groupings(),
+                            combine(agg.aggregates(), missing));
+                    Filter newFilter = new Filter(f.source(), newAgg, condition);
+                    // preserve old output
+                    return new Project(f.source(), newFilter, f.output());
                 }
-                return plan;
-            }
 
-            return plan;
+                return new Filter(f.source(), f.child(), condition);
+            }
+            return f;
         }
 
         private Set<NamedExpression> findMissingAggregate(Aggregate target, Expression from) {
@@ -1001,6 +999,66 @@ private Set<NamedExpression> findMissingAggregate(Aggregate target, Expression f
         }
     }
 
+
+    //
+    // Handle aggs in ORDER BY. To help folding, any aggs not found in the Aggregate
+    // will be pushed down to the Aggregate and then projected. This also simplifies the Verifier's job.
+    // Similar to HAVING, but using a different matching pattern, since HAVING is always a Filter with an Aggregate child,
+    // while an OrderBy can have multiple intermediate nodes (Filter, Project, etc...) before the Aggregate.
+    //
+    private static class ResolveAggsInOrderBy extends AnalyzeRule<OrderBy> {
+
+        @Override
+        protected boolean skipResolved() {
+            return false;
+        }
+
+        @Override
+        protected LogicalPlan rule(OrderBy ob) {
+            List<Order> orders = ob.order();
+
+            // 1. collect aggs inside an order by
+            List<NamedExpression> aggs = new ArrayList<>();
+            for (Order order : orders) {
+                if (Functions.isAggregate(order.child())) {
+                    aggs.add(Expressions.wrapAsNamed(order.child()));
+                }
+            }
+            if (aggs.isEmpty()) {
+                return ob;
+            }
+
+            // 2. find first Aggregate child and update it
+            final Holder<Boolean> found = new Holder<>(Boolean.FALSE);
+
+            LogicalPlan plan = ob.transformDown(a -> {
+                if (found.get() == Boolean.FALSE) {
+                    found.set(Boolean.TRUE);
+
+                    List<NamedExpression> missing = new ArrayList<>();
+
+                    for (NamedExpression orderedAgg : aggs) {
+                        if (Expressions.anyMatch(a.aggregates(), e -> Expressions.equalsAsAttribute(e, orderedAgg)) == false) {
+                            missing.add(orderedAgg);
+                        }
+                    }
+                    // agg already contains all aggs
+                    if (missing.isEmpty() == false) {
+                        // save aggregates
+                        return new Aggregate(a.source(), a.child(), a.groupings(), CollectionUtils.combine(a.aggregates(), missing));
+                    }
+                }
+                return a;
+            }, Aggregate.class);
+
+            // if the plan was updated, project the initial aggregates
+            if (plan != ob) {
+                return new Project(ob.source(), plan, ob.output());
+            }
+            return ob;
+        }
+    }
+
     private class PruneDuplicateFunctions extends AnalyzeRule<LogicalPlan> {
 
         @Override