Add sort verification for writer fuzzer (facebookincubator#10235)

Summary: Pull Request resolved: facebookincubator#10235 Reviewed By: xiaoxmeng Differential Revision: D58762369 Pulled By: kewang1024 fbshipit-source-id: 2ba3aab92957b698699148f533eb5a5a9e8ba7c0
Joe-Abraham · Jun 25, 2024 · 456f2a6 · 456f2a6
1 parent dc834b7
commit 456f2a6
Show file tree

Hide file tree

Showing 7 changed files with 241 additions and 61 deletions.
diff --git a/velox/docs/develop/testing/writer-fuzzer.rst b/velox/docs/develop/testing/writer-fuzzer.rst
@@ -3,17 +3,19 @@ Writer Fuzzer
 =============
 
 Writer fuzzer tests table write plan with up to 5 regular columns, up to
-3 partition keys and up to 3 bucket columns.
+3 partition keys, up to 3 bucket columns, and up to 3 sorted columns.
 
-At each iteration, fuzzer randomly generate a table write plan with different
-table properties including un-partitioned and partitioned, non-bucketed and bucketed.
+At each iteration, the fuzzer randomly generates a table write plan with different
+table properties, including un-partitioned and partitioned, non-bucketed and bucketed,
+and sorted and unsorted.
 
 The fuzzer then generates inputs and runs the query plan and compares the
 results with PrestoDB.
 As of now, we compare:
 1. How many rows were written.
 2. Output directories have the same directory layout and hierarchy.
-3. Same data were written by velox and prestoDB.
+3. The same data were written by Velox and PrestoDB, including the bucket number.
+4. Data in the sorted columns is in the same order if the table is sorted.
 
 How to run
 ----------

diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp
@@ -573,8 +573,13 @@ std::optional<std::string> PrestoQueryRunner::toSql(
 
   // Returns a CTAS sql with specified table properties from TableWriteNode,
   // example sql:
-  // CREATE TABLE tmp_write WITH (PARTITIONED_BY = ARRAY['p0'], BUCKETED_COUNT =
-  // 20, BUCKETED_BY = ARRAY['b0', 'b1']) AS SELECT * FROM tmp
+  // CREATE TABLE tmp_write WITH (
+  // PARTITIONED_BY = ARRAY['p0'],
+  // BUCKET_COUNT = 2, BUCKETED_BY = ARRAY['b0', 'b1'],
+  // SORTED_BY = ARRAY['s0 ASC', 's1 DESC'],
+  // FORMAT = 'ORC'
+  // )
+  // AS SELECT * FROM tmp
   std::stringstream sql;
   sql << "CREATE TABLE tmp_write";
   std::vector<std::string> partitionKeys;
@@ -583,31 +588,43 @@ std::optional<std::string> PrestoQueryRunner::toSql(
       partitionKeys.push_back(insertTableHandle->inputColumns()[i]->name());
     }
   }
+  sql << " WITH (";
 
   if (insertTableHandle->isPartitioned()) {
-    sql << " WITH (PARTITIONED_BY = ARRAY[";
+    sql << " PARTITIONED_BY = ARRAY[";
     for (int i = 0; i < partitionKeys.size(); ++i) {
       appendComma(i, sql);
       sql << "'" << partitionKeys[i] << "'";
     }
-    sql << "]";
+    sql << "], ";
 
     if (insertTableHandle->bucketProperty() != nullptr) {
       const auto bucketCount =
           insertTableHandle->bucketProperty()->bucketCount();
       const auto bucketColumns =
           insertTableHandle->bucketProperty()->bucketedBy();
-      sql << ", BUCKET_COUNT = " << bucketCount << ", BUCKETED_BY = ARRAY[";
+      sql << " BUCKET_COUNT = " << bucketCount << ", BUCKETED_BY = ARRAY[";
       for (int i = 0; i < bucketColumns.size(); ++i) {
         appendComma(i, sql);
         sql << "'" << bucketColumns[i] << "'";
       }
-      sql << "]";
+      sql << "], ";
+
+      const auto sortColumns = insertTableHandle->bucketProperty()->sortedBy();
+      if (!sortColumns.empty()) {
+        sql << " SORTED_BY = ARRAY[";
+        for (int i = 0; i < sortColumns.size(); ++i) {
+          appendComma(i, sql);
+          sql << "'" << sortColumns[i]->sortColumn() << " "
+              << (sortColumns[i]->sortOrder().isAscending() ? "ASC" : "DESC")
+              << "'";
+        }
+        sql << "], ";
+      }
     }
-    sql << ")";
   }
 
-  sql << " AS SELECT * FROM tmp";
+  sql << "FORMAT = 'ORC')  AS SELECT * FROM tmp";
   return sql.str();
 }