
Update dev guide for benchmarks #13988

Merged · 4 commits · Oct 26, 2022

Changes from all commits
2 changes: 1 addition & 1 deletion pom.xml

@@ -656,7 +656,7 @@
       <dependency>
         <groupId>io.trino.benchto</groupId>
         <artifactId>benchto-driver</artifactId>
-        <version>0.19</version>
+        <version>0.20</version>
       </dependency>

       <dependency>
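This version bump pairs with the README changes below: the rewritten "Running benchto-driver" section launches the driver from the 0.20 `-exec` jar in the local Maven repository.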
42 changes: 25 additions & 17 deletions testing/trino-benchto-benchmarks/README.md

@@ -1,14 +1,14 @@
-# Presto Benchto benchmarks
+# Trino Benchto benchmarks

 The Benchto benchmarks utilize [Benchto](https://github.com/trinodb/benchto) benchmarking
-utility to do macro benchmarking of Presto. As opposed to micro benchmarking which exercises
-a class or a small, coherent set of classes, macro benchmarks done with Benchto use Presto
-end-to-end, by accessing it through its API (usually with `presto-jdbc`), executing queries,
+utility to do macro benchmarking of Trino. As opposed to micro benchmarking which exercises
+a class or a small, coherent set of classes, macro benchmarks done with Benchto use Trino
+end-to-end, by accessing it through its API (usually with `trino-jdbc`), executing queries,
 measuring time and gathering various metrics.

 ## Benchmarking suites

-Even though benchmarks exercise Presto end-to-end, a single benchmark cannot use all Presto
+Even though benchmarks exercise Trino end-to-end, a single benchmark cannot use all Trino
 features. Therefore benchmarks are organized in suites, like:

 * *tpch* - queries closely following the [TPC-H](http://www.tpc.org/tpch/) benchmark
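To make the macro/micro distinction concrete, the kind of end-to-end loop Benchto automates looks roughly like the sketch below. This is illustrative only: it uses the `trino` PyPI client rather than the `trino-jdbc` driver the README mentions, and the host, user, and query are placeholders, not values from this PR.

```python
# Illustrative sketch only -- Benchto automates this loop, plus warmup runs,
# metric collection, and reporting to benchto-service.
# Assumes `pip install trino` (the Trino Python client); the Benchto driver
# itself connects via trino-jdbc. Host/port/user/query are placeholders.
import time

import trino

conn = trino.dbapi.connect(host="localhost", port=8080, user="benchmark")
cur = conn.cursor()

durations = []
for run in range(3):  # Benchto's `runs` setting plays this role
    start = time.monotonic()
    cur.execute("SELECT count(*) FROM tpch.sf1.lineitem")
    cur.fetchall()  # drain results so the whole query is measured end-to-end
    durations.append(time.monotonic() - start)

print("runs:", ["%.2fs" % d for d in durations])
```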
@@ -18,7 +18,7 @@ features. Therefore benchmarks are organized in suites, like:

 ### Requirements

-* Presto already installed on the target environment
+* Trino already installed on the target environment
 * Basic understanding of Benchto [components and architecture](https://github.com/trinodb/benchto)
 * Benchto service [configured and running](https://github.com/trinodb/benchto/tree/master/benchto-service)
 * An environment [defined in Benchto service](https://github.com/trinodb/benchto/tree/master/benchto-service#creating-environment)
@@ -27,29 +27,33 @@ features. Therefore benchmarks are organized in suites, like:

 Benchto driver needs to know two things: what benchmark is to be run and what environment
 it is to be run on. For the purpose of the following example, we will use `tpch` benchmark
-and Presto server running at `localhost:8080`, with Benchto service running at `localhost:8081`.
+and Trino server running at `localhost:8080`, with Benchto service running at `localhost:8081`.

 Benchto driver uses Spring Boot to locate the environment configuration file and read its
-settings. To continue with our example, one needs to place an `application-presto-devenv.yaml`
+settings. To continue with our example, one needs to place an `application.yaml`
 file in the current directory (i.e. the directory from which the benchmark will be invoked),
 with the following content:

 ```yaml
+benchmarks: src/main/resources/benchmarks
+sql: src/main/resources/sql
+query-results-dir: target/results
+
 benchmark-service:
   url: http://localhost:8081

+data-sources:
+  trino:
+    url: jdbc:trino://localhost:8080
+    username: na
+    password: na
+    driver-class-name: io.trino.jdbc.TrinoDriver
+
 environment:
   name: TRINO-DEV

-presto:
-  url: http://localhost:8080
-  username: na
-
 benchmark:
   feature:
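A note on the keys above, inferred from the Benchto documentation linked in the requirements: `data-sources.trino` defines the JDBC connection the driver opens against the Trino coordinator, while `environment.name` must match an environment already created in benchto-service, since benchmark results are recorded under that environment.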
@@ -63,10 +67,10 @@ macros:

 ### Bootstrapping benchmark data

-* Make sure you have configured [Presto TPC-H connector](https://trino.io/docs/current/connector/tpch.html).
+* Make sure you have configured [Trino TPC-H connector](https://trino.io/docs/current/connector/tpch.html).
 * Bootstrap benchmark data:
 ```bash
-python presto-benchto-benchmarks/generate_schemas/generate-tpch.py | presto-cli-[version]-executable.jar --server [presto_coordinator-url]:[port]
+testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py --factors sf1 --formats orc | trino-cli-[version]-executable.jar --server [trino_coordinator-url]:[port]
 ```
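For reference, with `--factors sf1 --formats orc` the rewritten generator emits DDL of the following shape (read directly off the script shown later in this diff); the Trino CLI then executes it statement by statement:

```sql
CREATE SCHEMA IF NOT EXISTS hive.tpch_sf1_orc;
CREATE TABLE IF NOT EXISTS "hive"."tpch_sf1_orc"."customer" WITH (format = 'orc') AS SELECT * FROM tpch.sf1."customer";
CREATE TABLE IF NOT EXISTS "hive"."tpch_sf1_orc"."lineitem" WITH (format = 'orc') AS SELECT * FROM tpch.sf1."lineitem";
-- ...and likewise for the remaining TPC-H tables
```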

### Configuring overrides file

@@ -77,17 +81,21 @@ runs or different underlying schemas. Create a simple `overrides.yaml` file:

 ```yaml
 runs: 10
-tpch_medium: tpcds_10gb_txt
+tpch_300: tpch_sf1_orc
+scale_300: 1
+tpch_1000: tpch_sf1_orc
+scale_1000: 1
+tpch_3000: tpch_sf1_orc
+scale_3000: 1
+prefix: ""
 ```
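As the file name suggests, these keys override variables referenced by the benchmark descriptors: `runs` controls how many times each query is executed, and pairs like `tpch_300: tpch_sf1_orc` / `scale_300: 1` remap the nominal 300/1000/3000 scale-factor schemas onto the small `tpch_sf1_orc` schema bootstrapped above, which suits a smoke-test run. (This reading is inferred from the names; see the Benchto docs for the authoritative variable semantics.)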

### Running benchto-driver

 With the scene set up as in the previous section, the benchmark can be run with:
 ```bash
-./mvnw clean package -pl :trino-benchto-benchmarks
-java -Xmx1g -jar trino-benchto-benchmarks/target/trino-benchto-benchmarks-*-executable.jar \
-    --sql trino-benchto-benchmarks/src/main/resources/sql \
-    --benchmarks trino-benchto-benchmarks/src/main/resources/benchmarks \
-    --activeBenchmarks=presto/tpch --profile=presto-devenv \
-    --overrides overrides.yaml
+java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.20/benchto-driver-0.20-exec.jar" \
+    --activeBenchmarks=trino/tpch \
+    --overrides "overrides.yaml"
 ```
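The new command assumes the driver jar already sits in the local Maven repository (any build that resolves the `benchto-driver` dependency puts it there). If it is missing, an invocation along these lines should fetch it; this is a suggestion, not part of the PR, and assumes network access to the artifact repository:

```bash
./mvnw dependency:get -Dartifact=io.trino.benchto:benchto-driver:0.20:jar:exec
```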
134 changes: 83 additions & 51 deletions testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py

Review thread on this file:

> **Member:** did you use the tool to generate schemas manually as a test?
>
> **@MiguelWeezardo** (Oct 25, 2022): Thanks, running this unveiled a trivial bug. It took a while to set up Trino with a fully functional Hive connector, but I finally found a product test environment with both hive and tpcds.

@@ -1,53 +1,85 @@
 #!/usr/bin/env python

-schemas = [
-    # (new_schema, source_schema)
-    ('tpcds_sf10_orc', 'tpcds.sf10'),
-    ('tpcds_sf30_orc', 'tpcds.sf30'),
-    ('tpcds_sf100_orc', 'tpcds.sf100'),
-    ('tpcds_sf300_orc', 'tpcds.sf300'),
-    ('tpcds_sf1000_orc', 'tpcds.sf1000'),
-    ('tpcds_sf3000_orc', 'tpcds.sf3000'),
-    ('tpcds_sf10000_orc', 'tpcds.sf10000'),
-]
-
-tables = [
-    'call_center',
-    'catalog_page',
-    'catalog_returns',
-    'catalog_sales',
-    'customer',
-    'customer_address',
-    'customer_demographics',
-    'date_dim',
-    'household_demographics',
-    'income_band',
-    'inventory',
-    'item',
-    'promotion',
-    'reason',
-    'ship_mode',
-    'store',
-    'store_returns',
-    'store_sales',
-    'time_dim',
-    'warehouse',
-    'web_page',
-    'web_returns',
-    'web_sales',
-    'web_site',
-]
-
-for (new_schema, source_schema) in schemas:
-
-    if new_schema.endswith('_orc'):
-        format = 'ORC'
-    elif new_schema.endswith('_text'):
-        format = 'TEXTFILE'
-    else:
-        raise ValueError(new_schema)
-
-    print('CREATE SCHEMA hive.{};'.format(new_schema,))
-    for table in tables:
-        print('CREATE TABLE "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format(
-            new_schema, table, format, source_schema, table))
+import argparse
+
+
+def generate(factors, formats, tables):
+    for format in formats:
+        for factor in factors:
+            new_schema = "tpcds_" + factor + "_" + format
+            source_schema = "tpcds." + factor
+            print(
+                "CREATE SCHEMA IF NOT EXISTS hive.{};".format(
+                    new_schema,
+                )
+            )
+            for table in tables:
+                print(
+                    'CREATE TABLE IF NOT EXISTS "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format(
+                        new_schema, table, format, source_schema, table
+                    )
+                )
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate test data.")
+    parser.add_argument(
+        "--factors",
+        type=csvtype(
+            ["tiny", "sf1", "sf10", "sf30", "sf100", "sf300", "sf1000", "sf3000", "sf10000"]
+        ),
+        default=["sf10", "sf30", "sf100", "sf300", "sf1000", "sf3000", "sf10000"],
+    )
+    parser.add_argument("--formats", type=csvtype(["orc", "text"]), default=["orc"])
+    default_tables = [
+        "call_center",
+        "catalog_page",
+        "catalog_returns",
+        "catalog_sales",
+        "customer",
+        "customer_address",
+        "customer_demographics",
+        "date_dim",
+        "household_demographics",
+        "income_band",
+        "inventory",
+        "item",
+        "promotion",
+        "reason",
+        "ship_mode",
+        "store",
+        "store_returns",
+        "store_sales",
+        "time_dim",
+        "warehouse",
+        "web_page",
+        "web_returns",
+        "web_sales",
+        "web_site",
+    ]
+    parser.add_argument(
+        "--tables", type=csvtype(default_tables), default=default_tables
+    )
+    args = parser.parse_args()
+    generate(args.factors, args.formats, args.tables)
+
+
+def csvtype(choices):
+    """Return a function that splits and checks comma-separated values."""
+
+    def splitarg(arg):
+        values = arg.split(",")
+        for value in values:
+            if value not in choices:
+                raise argparse.ArgumentTypeError(
+                    "invalid choice: {!r} (choose from {})".format(
+                        value, ", ".join(map(repr, choices))
+                    )
+                )
+        return values
+
+    return splitarg
+
+
+if __name__ == "__main__":
+    main()
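The TPC-DS variant is invoked the same way as the TPC-H one shown in the README; for example, to print the DDL for a small smoke-test schema (pipe it to the Trino CLI as in the README's bootstrap step):

```bash
python testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py --factors sf1 --formats orc
```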
103 changes: 68 additions & 35 deletions testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py

@@ -1,37 +1,70 @@
 #!/usr/bin/env python

-schemas = [
-    # (new_schema, source_schema)
-    ('tpch_sf300_orc', 'tpch.sf300'),
-    ('tpch_sf1000_orc', 'tpch.sf1000'),
-    ('tpch_sf3000_orc', 'tpch.sf3000'),
-
-    ('tpch_sf300_text', 'hive.tpch_sf300_orc'),
-    ('tpch_sf1000_text', 'hive.tpch_sf1000_orc'),
-    ('tpch_sf3000_text', 'hive.tpch_sf3000_orc'),
-]
-
-tables = [
-    'customer',
-    'lineitem',
-    'nation',
-    'orders',
-    'part',
-    'partsupp',
-    'region',
-    'supplier',
-]
-
-for (new_schema, source_schema) in schemas:
-
-    if new_schema.endswith('_orc'):
-        format = 'ORC'
-    elif new_schema.endswith('_text'):
-        format = 'TEXTFILE'
-    else:
-        raise ValueError(new_schema)
-
-    print('CREATE SCHEMA hive.{};'.format(new_schema,))
-    for table in tables:
-        print('CREATE TABLE "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format(
-            new_schema, table, format, source_schema, table))
+import argparse
+
+
+def generate(factors, formats, tables):
+    for format in formats:
+        for factor in factors:
+            new_schema = "tpch_" + factor + "_" + format
+            source_schema = "tpch." + factor
+            print(
+                "CREATE SCHEMA IF NOT EXISTS hive.{};".format(
+                    new_schema,
+                )
+            )
+            for table in tables:
+                print(
+                    'CREATE TABLE IF NOT EXISTS "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format(
+                        new_schema, table, format, source_schema, table
+                    )
+                )
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate test data.")
+    parser.add_argument(
+        "--factors",
+        type=csvtype(["tiny", "sf1", "sf100", "sf300", "sf1000", "sf3000"]),
+        default=["sf300", "sf1000", "sf3000"],
+    )
+    parser.add_argument(
+        "--formats", type=csvtype(["orc", "text"]), default=["orc", "text"]
+    )
+    default_tables = [
+        "customer",
+        "lineitem",
+        "nation",
+        "orders",
+        "part",
+        "partsupp",
+        "region",
+        "supplier",
+    ]
+    parser.add_argument(
+        "--tables", type=csvtype(default_tables), default=default_tables
+    )
+
+    args = parser.parse_args()
+    generate(args.factors, args.formats, args.tables)
+
+
+def csvtype(choices):
+    """Return a function that splits and checks comma-separated values."""
+
+    def splitarg(arg):
+        values = arg.split(",")
+        for value in values:
+            if value not in choices:
+                raise argparse.ArgumentTypeError(
+                    "invalid choice: {!r} (choose from {})".format(
+                        value, ", ".join(map(repr, choices))
+                    )
+                )
+        return values
+
+    return splitarg
+
+
+if __name__ == "__main__":
+    main()
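Both scripts share the `csvtype` helper, which validates a single comma-separated argument against a whitelist; this keeps invocations like `--factors sf300,sf1000` compact instead of repeating the flag per value. A quick illustrative check of its contract, assuming `csvtype` from the script above is in scope:

```python
# Hypothetical sanity check; `csvtype` as defined in the script above.
parse_formats = csvtype(["orc", "text"])
assert parse_formats("orc,text") == ["orc", "text"]
# parse_formats("parquet") would raise argparse.ArgumentTypeError
```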