From 6d5381472be944f6da38944b01078b5686b25be6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Wa=C5=9B?= <jan.was@starburstdata.com>
Date: Thu, 25 Aug 2022 16:15:15 +0200
Subject: [PATCH 1/4] Parameterize data generation for benchmarks

Sometimes we want to run benchmarks on a small dataset.
We want to be able to choose the scale factors for TPC-H and TPC-DS.
This change keeps the original defaults while allowing for scale factor
and format overrides.
---
 testing/trino-benchto-benchmarks/README.md    |   2 +-
 .../generate_schemas/generate-tpcds.py        | 134 +++++++++++-------
 .../generate_schemas/generate-tpch.py         | 103 +++++++++-----
 3 files changed, 152 insertions(+), 87 deletions(-)

diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md
index cb8d9697df1a..a7f782d97ac2 100644
--- a/testing/trino-benchto-benchmarks/README.md
+++ b/testing/trino-benchto-benchmarks/README.md
@@ -66,7 +66,7 @@ macros:
 * Make sure you have configured [Presto TPC-H connector](https://trino.io/docs/current/connector/tpch.html).
 * Bootstrap benchmark data:
   ```bash
-  python presto-benchto-benchmarks/generate_schemas/generate-tpch.py | presto-cli-[version]-executable.jar --server [presto_coordinator-url]:[port]
+  testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py --factors sf1 --formats orc | trino-cli-[version]-executable.jar --server [trino_coordinator-url]:[port]
   ```
 
 ### Configuring overrides file
diff --git a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py
index 5638308ce700..0008f062b433 100755
--- a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py
+++ b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py
@@ -1,53 +1,85 @@
 #!/usr/bin/env python
 
-schemas = [
-    # (new_schema, source_schema)
-    ('tpcds_sf10_orc', 'tpcds.sf10'),
-    ('tpcds_sf30_orc', 'tpcds.sf30'),
-    ('tpcds_sf100_orc', 'tpcds.sf100'),
-    ('tpcds_sf300_orc', 'tpcds.sf300'),
-    ('tpcds_sf1000_orc', 'tpcds.sf1000'),
-    ('tpcds_sf3000_orc', 'tpcds.sf3000'),
-    ('tpcds_sf10000_orc', 'tpcds.sf10000'),
-]
-
-tables = [
-    'call_center',
-    'catalog_page',
-    'catalog_returns',
-    'catalog_sales',
-    'customer',
-    'customer_address',
-    'customer_demographics',
-    'date_dim',
-    'household_demographics',
-    'income_band',
-    'inventory',
-    'item',
-    'promotion',
-    'reason',
-    'ship_mode',
-    'store',
-    'store_returns',
-    'store_sales',
-    'time_dim',
-    'warehouse',
-    'web_page',
-    'web_returns',
-    'web_sales',
-    'web_site',
-]
-
-for (new_schema, source_schema) in schemas:
-
-    if new_schema.endswith('_orc'):
-        format = 'ORC'
-    elif new_schema.endswith('_text'):
-        format = 'TEXTFILE'
-    else:
-        raise ValueError(new_schema)
-
-    print('CREATE SCHEMA hive.{};'.format(new_schema,))
-    for table in tables:
-        print('CREATE TABLE "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format(
-              new_schema, table, format, source_schema, table))
+import argparse
+
+
+def generate(factors, formats, tables):
+    """Print Trino SQL copying TPC-DS tables into one Hive schema per format/factor.
+
+    Schema names keep the short CLI format name, while the table property
+    uses the Hive storage format name ("text" must be spelled TEXTFILE).
+    """
+    storage_formats = {"orc": "ORC", "text": "TEXTFILE"}
+    for fmt in formats:
+        storage_format = storage_formats.get(fmt, fmt.upper())
+        for factor in factors:
+            new_schema = "tpcds_{}_{}".format(factor, fmt)
+            print("CREATE SCHEMA IF NOT EXISTS hive.{};".format(new_schema))
+            for table in tables:
+                print(
+                    'CREATE TABLE IF NOT EXISTS "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM tpcds.{}."{}";'.format(new_schema, table, storage_format, factor, table)
+                )
+
+
+def main():
+    """Parse the command-line options and print the TPC-DS bootstrap SQL."""
+    known_factors = [
+        "tiny", "sf1", "sf10", "sf30", "sf100", "sf300", "sf1000", "sf3000", "sf10000",
+    ]
+    known_tables = [
+        "call_center",
+        "catalog_page",
+        "catalog_returns",
+        "catalog_sales",
+        "customer",
+        "customer_address",
+        "customer_demographics",
+        "date_dim",
+        "household_demographics",
+        "income_band",
+        "inventory",
+        "item",
+        "promotion",
+        "reason",
+        "ship_mode",
+        "store",
+        "store_returns",
+        "store_sales",
+        "time_dim",
+        "warehouse",
+        "web_page",
+        "web_returns",
+        "web_sales",
+        "web_site",
+    ]
+    parser = argparse.ArgumentParser(description="Generate test data.")
+    # Each option takes a comma-separated list validated against known values.
+    # Defaults: every factor except "tiny" and "sf1", ORC only, all tables.
+    parser.add_argument(
+        "--factors", type=csvtype(known_factors), default=known_factors[2:]
+    )
+    parser.add_argument("--formats", type=csvtype(["orc", "text"]), default=["orc"])
+    parser.add_argument("--tables", type=csvtype(known_tables), default=known_tables)
+    options = parser.parse_args()
+    generate(options.factors, options.formats, options.tables)
+
+
+def csvtype(choices):
+    """Return an argparse ``type`` that splits and validates comma-separated values."""
+
+    def parse_csv(text):
+        items = text.split(",")
+        # Collect offenders first so the error names the earliest bad value.
+        unknown = [item for item in items if item not in choices]
+        if unknown:
+            allowed = ", ".join(repr(choice) for choice in choices)
+            message = "invalid choice: {!r} (choose from {})".format(unknown[0], allowed)
+            # argparse reports ArgumentTypeError messages to the user as usage errors.
+            raise argparse.ArgumentTypeError(message)
+        return items
+
+    return parse_csv
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py
index e843ef7865e1..bf092de4a897 100755
--- a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py
+++ b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py
@@ -1,37 +1,70 @@
 #!/usr/bin/env python
 
-schemas = [
-    # (new_schema, source_schema)
-    ('tpch_sf300_orc', 'tpch.sf300'),
-    ('tpch_sf1000_orc', 'tpch.sf1000'),
-    ('tpch_sf3000_orc', 'tpch.sf3000'),
-
-    ('tpch_sf300_text', 'hive.tpch_sf300_orc'),
-    ('tpch_sf1000_text', 'hive.tpch_sf1000_orc'),
-    ('tpch_sf3000_text', 'hive.tpch_sf3000_orc'),
-]
-
-tables = [
-    'customer',
-    'lineitem',
-    'nation',
-    'orders',
-    'part',
-    'partsupp',
-    'region',
-    'supplier',
-]
-
-for (new_schema, source_schema) in schemas:
-
-    if new_schema.endswith('_orc'):
-        format = 'ORC'
-    elif new_schema.endswith('_text'):
-        format = 'TEXTFILE'
-    else:
-        raise ValueError(new_schema)
-
-    print('CREATE SCHEMA hive.{};'.format(new_schema,))
-    for table in tables:
-        print('CREATE TABLE "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format(
-              new_schema, table, format, source_schema, table))
+import argparse
+
+
+def generate(factors, formats, tables):
+    """Print Trino SQL copying TPC-H tables into one Hive schema per format/factor.
+
+    Schema names keep the short CLI format name, while the table property
+    uses the Hive storage format name ("text" must be spelled TEXTFILE).
+    """
+    storage_formats = {"orc": "ORC", "text": "TEXTFILE"}
+    for fmt in formats:
+        storage_format = storage_formats.get(fmt, fmt.upper())
+        for factor in factors:
+            new_schema = "tpch_{}_{}".format(factor, fmt)
+            print("CREATE SCHEMA IF NOT EXISTS hive.{};".format(new_schema))
+            for table in tables:
+                print(
+                    'CREATE TABLE IF NOT EXISTS "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM tpch.{}."{}";'.format(new_schema, table, storage_format, factor, table)
+                )
+
+
+def main():
+    """Parse the command-line options and print the TPC-H bootstrap SQL."""
+    known_factors = ["tiny", "sf1", "sf100", "sf300", "sf1000", "sf3000"]
+    known_formats = ["orc", "text"]
+    known_tables = [
+        "customer",
+        "lineitem",
+        "nation",
+        "orders",
+        "part",
+        "partsupp",
+        "region",
+        "supplier",
+    ]
+    parser = argparse.ArgumentParser(description="Generate test data.")
+    # Each option takes a comma-separated list validated against known values.
+    # Defaults: factors sf300, sf1000 and sf3000; both formats; all tables.
+    parser.add_argument(
+        "--factors", type=csvtype(known_factors), default=known_factors[3:]
+    )
+    parser.add_argument(
+        "--formats", type=csvtype(known_formats), default=known_formats
+    )
+    parser.add_argument("--tables", type=csvtype(known_tables), default=known_tables)
+    options = parser.parse_args()
+    generate(options.factors, options.formats, options.tables)
+
+
+def csvtype(choices):
+    """Return an argparse ``type`` that splits and validates comma-separated values."""
+
+    def parse_csv(text):
+        items = text.split(",")
+        # Collect offenders first so the error names the earliest bad value.
+        unknown = [item for item in items if item not in choices]
+        if unknown:
+            allowed = ", ".join(repr(choice) for choice in choices)
+            message = "invalid choice: {!r} (choose from {})".format(unknown[0], allowed)
+            # argparse reports ArgumentTypeError messages to the user as usage errors.
+            raise argparse.ArgumentTypeError(message)
+        return items
+
+    return parse_csv
+
+
+if __name__ == "__main__":
+    main()

From 4dca72e31d60bc14e0dbd498483fa4a2b36dfbe7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=C5=9Alizak?= <michal.slizak+github@gmail.com>
Date: Fri, 7 Oct 2022 22:28:35 +0200
Subject: [PATCH 2/4] Rename Presto to Trino in README

---
 testing/trino-benchto-benchmarks/README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md
index a7f782d97ac2..94cbf8bb2953 100644
--- a/testing/trino-benchto-benchmarks/README.md
+++ b/testing/trino-benchto-benchmarks/README.md
@@ -1,14 +1,14 @@
-# Presto Benchto benchmarks
+# Trino Benchto benchmarks
 
 The Benchto benchmarks utilize [Benchto](https://github.com/trinodb/benchto) benchmarking
-utility to do macro benchmarking of Presto. As opposed to micro benchmarking which exercises
-a class or a small, coherent set of classes, macro benchmarks done with Benchto use Presto
-end-to-end, by accessing it through its API (usually with `presto-jdbc`), executing queries,
+utility to do macro benchmarking of Trino. As opposed to micro benchmarking which exercises
+a class or a small, coherent set of classes, macro benchmarks done with Benchto use Trino
+end-to-end, by accessing it through its API (usually with `trino-jdbc`), executing queries,
 measuring time and gathering various metrics.
 
 ## Benchmarking suites
 
-Even though benchmarks exercise Presto end-to-end, a single benchmark cannot use all Presto
+Even though benchmarks exercise Trino end-to-end, a single benchmark cannot use all Trino
 features. Therefore benchmarks are organized in suites, like:
 
 * *tpch* - queries closely following the [TPC-H](http://www.tpc.org/tpch/) benchmark
@@ -18,7 +18,7 @@ features. Therefore benchmarks are organized in suites, like:
 
 ### Requirements
 
-* Presto already installed on the target environment
+* Trino already installed on the target environment
 * Basic understanding of Benchto [components and architecture](https://github.com/trinodb/benchto)
 * Benchto service [configured and running](https://github.com/trinodb/benchto/tree/master/benchto-service)
 * An environment [defined in Benchto service](https://github.com/trinodb/benchto/tree/master/benchto-service#creating-environment)
@@ -27,10 +27,10 @@ features. Therefore benchmarks are organized in suites, like:
 
 Benchto driver needs to know two things: what benchmark is to be run and what environment
 it is to be run on. For the purpose of the following example, we will use `tpch` benchmark
-and Presto server running at `localhost:8080`, with Benchto service running at `localhost:8081`.
+and Trino server running at `localhost:8080`, with Benchto service running at `localhost:8081`.
 
 Benchto driver uses Spring Boot to locate environment configuration file, so to pass the
-configuration. To continue with our example, one needs to place an `application-presto-devenv.yaml`
+configuration. To continue with our example, one needs to place an `application-trino-devenv.yaml`
 file in the current directory (i.e. the directory from which the benchmark will be invoked),
 with the following content:
 
@@ -63,7 +63,7 @@ macros:
 
 ### Bootstrapping benchmark data
 
-* Make sure you have configured [Presto TPC-H connector](https://trino.io/docs/current/connector/tpch.html).
+* Make sure you have configured [Trino TPC-H connector](https://trino.io/docs/current/connector/tpch.html).
 * Bootstrap benchmark data:
   ```bash
   testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py --factors sf1 --formats orc | trino-cli-[version]-executable.jar --server [trino_coordinator-url]:[port]
@@ -88,6 +88,6 @@ With the scene set up as in the previous section, the benchmark can be run with:
 java -Xmx1g -jar trino-benchto-benchmarks/target/trino-benchto-benchmarks-*-executable.jar \
     --sql trino-benchto-benchmarks/src/main/resources/sql \
     --benchmarks trino-benchto-benchmarks/src/main/resources/benchmarks \
-    --activeBenchmarks=presto/tpch --profile=presto-devenv \
+    --activeBenchmarks=trino/tpch --profile=trino-devenv \
     --overrides overrides.yaml
 ```

From 8e51f89b66a68dba4aa61c449cce11685ba4b578 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=C5=9Alizak?= <michal.slizak+github@gmail.com>
Date: Fri, 7 Oct 2022 15:24:07 +0200
Subject: [PATCH 3/4] Additional updates to README

---
 testing/trino-benchto-benchmarks/README.md | 25 ++++++++++++++--------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md
index 94cbf8bb2953..5c167e9b05ec 100644
--- a/testing/trino-benchto-benchmarks/README.md
+++ b/testing/trino-benchto-benchmarks/README.md
@@ -30,11 +30,15 @@ it is to be run on. For the purpose of the following example, we will use `tpch`
 and Trino server running at `localhost:8080`, with Benchto service running at `localhost:8081`.
 
 Benchto driver uses Spring Boot to locate environment configuration file, so to pass the
-configuration. To continue with our example, one needs to place an `application-trino-devenv.yaml`
+configuration. To continue with our example, one needs to place an `application.yaml`
 file in the current directory (i.e. the directory from which the benchmark will be invoked),
 with the following content:
 
 ```yaml
+benchmarks: src/main/resources/benchmarks
+sql: src/main/resources/sql
+query-results-dir: target/results
+
 benchmark-service:
   url: http://localhost:8081
 
@@ -42,7 +46,6 @@ data-sources:
   trino:
     url: jdbc:trino://localhost:8080
     username: na
-    password: na
     driver-class-name: io.trino.jdbc.TrinoDriver
 
 environment:
@@ -50,6 +53,7 @@ environment:
 
 presto:
   url: http://localhost:8080
+  username: na
 
 benchmark:
   feature:
@@ -77,17 +81,20 @@ runs or different underlying schemas. Create a simple `overrides.yaml` file:
 
 ```yaml
 runs: 10
-tpch_medium: tpcds_10gb_txt
+tpch_300: tpch_sf1_orc
+scale_300: 1
+tpch_1000: tpch_sf1_orc
+scale_1000: 1
+tpch_3000: tpch_sf1_orc
+scale_3000: 1
+prefix: ""
 ```
 
 ### Running benchto-driver
 
 With the scene set up as in the previous section, the benchmark can be run with:
 ```bash
-./mvnw clean package -pl :trino-benchto-benchmarks
-java -Xmx1g -jar trino-benchto-benchmarks/target/trino-benchto-benchmarks-*-executable.jar \
-    --sql trino-benchto-benchmarks/src/main/resources/sql \
-    --benchmarks trino-benchto-benchmarks/src/main/resources/benchmarks \
-    --activeBenchmarks=trino/tpch --profile=trino-devenv \
-    --overrides overrides.yaml
+java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.18/benchto-driver-0.18.jar" \
+            --activeBenchmarks=trino/tpch \
+            --overrides "overrides.yaml"
 ```

From 78c5ef4bd2b084b8ba6c2097903ddca506b7b716 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=C5=9Alizak?= <michal.slizak+github@gmail.com>
Date: Fri, 7 Oct 2022 22:36:53 +0200
Subject: [PATCH 4/4] Update to Benchto which is compatible with JDK 17

---
 pom.xml                                    | 2 +-
 testing/trino-benchto-benchmarks/README.md | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index ebedf7c979a1..2b24c608ab65 100644
--- a/pom.xml
+++ b/pom.xml
@@ -656,7 +656,7 @@
             <dependency>
                 <groupId>io.trino.benchto</groupId>
                 <artifactId>benchto-driver</artifactId>
-                <version>0.19</version>
+                <version>0.20</version>
             </dependency>
 
             <dependency>
diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md
index 5c167e9b05ec..1a3f47f0b6c5 100644
--- a/testing/trino-benchto-benchmarks/README.md
+++ b/testing/trino-benchto-benchmarks/README.md
@@ -94,7 +94,8 @@ prefix: ""
 
 With the scene set up as in the previous section, the benchmark can be run with:
 ```bash
-java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.18/benchto-driver-0.18.jar" \
+./mvnw clean package -pl :trino-benchto-benchmarks
+java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.20/benchto-driver-0.20-exec.jar" \
             --activeBenchmarks=trino/tpch \
             --overrides "overrides.yaml"
 ```