From 6d5381472be944f6da38944b01078b5686b25be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Wa=C5=9B?= Date: Thu, 25 Aug 2022 16:15:15 +0200 Subject: [PATCH 1/4] Parameterize data generation for benchmarks Sometimes we want to run benchmarks on a small dataset. We want to be able to choose the scale factors for TPC-H and TPC-DS. This change keeps the original defaults while allowing for scale factor and format overrides. --- testing/trino-benchto-benchmarks/README.md | 2 +- .../generate_schemas/generate-tpcds.py | 134 +++++++++++------- .../generate_schemas/generate-tpch.py | 103 +++++++++----- 3 files changed, 152 insertions(+), 87 deletions(-) diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md index cb8d9697df1a..a7f782d97ac2 100644 --- a/testing/trino-benchto-benchmarks/README.md +++ b/testing/trino-benchto-benchmarks/README.md @@ -66,7 +66,7 @@ macros: * Make sure you have configured [Presto TPC-H connector](https://trino.io/docs/current/connector/tpch.html). 
* Bootstrap benchmark data: ```bash - python presto-benchto-benchmarks/generate_schemas/generate-tpch.py | presto-cli-[version]-executable.jar --server [presto_coordinator-url]:[port] + testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py --factors sf1 --formats orc | trino-cli-[version]-executable.jar --server [trino_coordinator-url]:[port] ``` ### Configuring overrides file diff --git a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py index 5638308ce700..0008f062b433 100755 --- a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py +++ b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpcds.py @@ -1,53 +1,85 @@ #!/usr/bin/env python -schemas = [ - # (new_schema, source_schema) - ('tpcds_sf10_orc', 'tpcds.sf10'), - ('tpcds_sf30_orc', 'tpcds.sf30'), - ('tpcds_sf100_orc', 'tpcds.sf100'), - ('tpcds_sf300_orc', 'tpcds.sf300'), - ('tpcds_sf1000_orc', 'tpcds.sf1000'), - ('tpcds_sf3000_orc', 'tpcds.sf3000'), - ('tpcds_sf10000_orc', 'tpcds.sf10000'), -] - -tables = [ - 'call_center', - 'catalog_page', - 'catalog_returns', - 'catalog_sales', - 'customer', - 'customer_address', - 'customer_demographics', - 'date_dim', - 'household_demographics', - 'income_band', - 'inventory', - 'item', - 'promotion', - 'reason', - 'ship_mode', - 'store', - 'store_returns', - 'store_sales', - 'time_dim', - 'warehouse', - 'web_page', - 'web_returns', - 'web_sales', - 'web_site', -] - -for (new_schema, source_schema) in schemas: - - if new_schema.endswith('_orc'): - format = 'ORC' - elif new_schema.endswith('_text'): - format = 'TEXTFILE' - else: - raise ValueError(new_schema) - - print('CREATE SCHEMA hive.{};'.format(new_schema,)) - for table in tables: - print('CREATE TABLE "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format( - new_schema, table, format, source_schema, table)) +import argparse + + +def generate(factors, formats, tables): + 
for format in formats: + for factor in factors: + new_schema = "tpcds_" + factor + "_" + format + source_schema = "tpcds." + factor + print( + "CREATE SCHEMA IF NOT EXISTS hive.{};".format( + new_schema, + ) + ) + for table in tables: + print( + 'CREATE TABLE IF NOT EXISTS "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format( + new_schema, table, format, source_schema, table + ) + ) + + +def main(): + parser = argparse.ArgumentParser(description="Generate test data.") + parser.add_argument( + "--factors", + type=csvtype( + ["tiny", "sf1", "sf10", "sf30", "sf100", "sf300", "sf1000", "sf3000", "sf10000"] + ), + default=["sf10", "sf30", "sf100", "sf300", "sf1000", "sf3000", "sf10000"], + ) + parser.add_argument("--formats", type=csvtype(["orc", "text"]), default=["orc"]) + default_tables = [ + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", + ] + parser.add_argument( + "--tables", type=csvtype(default_tables), default=default_tables + ) + args = parser.parse_args() + generate(args.factors, args.formats, args.tables) + + +def csvtype(choices): + """Return a function that splits and checks comma-separated values.""" + + def splitarg(arg): + values = arg.split(",") + for value in values: + if value not in choices: + raise argparse.ArgumentTypeError( + "invalid choice: {!r} (choose from {})".format( + value, ", ".join(map(repr, choices)) + ) + ) + return values + + return splitarg + + +if __name__ == "__main__": + main() diff --git a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py index e843ef7865e1..bf092de4a897 100755 
--- a/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py +++ b/testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py @@ -1,37 +1,70 @@ #!/usr/bin/env python -schemas = [ - # (new_schema, source_schema) - ('tpch_sf300_orc', 'tpch.sf300'), - ('tpch_sf1000_orc', 'tpch.sf1000'), - ('tpch_sf3000_orc', 'tpch.sf3000'), - - ('tpch_sf300_text', 'hive.tpch_sf300_orc'), - ('tpch_sf1000_text', 'hive.tpch_sf1000_orc'), - ('tpch_sf3000_text', 'hive.tpch_sf3000_orc'), -] - -tables = [ - 'customer', - 'lineitem', - 'nation', - 'orders', - 'part', - 'partsupp', - 'region', - 'supplier', -] - -for (new_schema, source_schema) in schemas: - - if new_schema.endswith('_orc'): - format = 'ORC' - elif new_schema.endswith('_text'): - format = 'TEXTFILE' - else: - raise ValueError(new_schema) - - print('CREATE SCHEMA hive.{};'.format(new_schema,)) - for table in tables: - print('CREATE TABLE "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format( - new_schema, table, format, source_schema, table)) +import argparse + + +def generate(factors, formats, tables): + for format in formats: + for factor in factors: + new_schema = "tpch_" + factor + "_" + format + source_schema = "tpch." 
+ factor + print( + "CREATE SCHEMA IF NOT EXISTS hive.{};".format( + new_schema, + ) + ) + for table in tables: + print( + 'CREATE TABLE IF NOT EXISTS "hive"."{}"."{}" WITH (format = \'{}\') AS SELECT * FROM {}."{}";'.format( + new_schema, table, format, source_schema, table + ) + ) + + +def main(): + parser = argparse.ArgumentParser(description="Generate test data.") + parser.add_argument( + "--factors", + type=csvtype(["tiny", "sf1", "sf100", "sf300", "sf1000", "sf3000"]), + default=["sf300", "sf1000", "sf3000"], + ) + parser.add_argument( + "--formats", type=csvtype(["orc", "text"]), default=["orc", "text"] + ) + default_tables = [ + "customer", + "lineitem", + "nation", + "orders", + "part", + "partsupp", + "region", + "supplier", + ] + parser.add_argument( + "--tables", type=csvtype(default_tables), default=default_tables + ) + + args = parser.parse_args() + generate(args.factors, args.formats, args.tables) + + +def csvtype(choices): + """Return a function that splits and checks comma-separated values.""" + + def splitarg(arg): + values = arg.split(",") + for value in values: + if value not in choices: + raise argparse.ArgumentTypeError( + "invalid choice: {!r} (choose from {})".format( + value, ", ".join(map(repr, choices)) + ) + ) + return values + + return splitarg + + +if __name__ == "__main__": + main() From 4dca72e31d60bc14e0dbd498483fa4a2b36dfbe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=9Alizak?= Date: Fri, 7 Oct 2022 22:28:35 +0200 Subject: [PATCH 2/4] Rename Presto to Trino in README --- testing/trino-benchto-benchmarks/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md index a7f782d97ac2..94cbf8bb2953 100644 --- a/testing/trino-benchto-benchmarks/README.md +++ b/testing/trino-benchto-benchmarks/README.md @@ -1,14 +1,14 @@ -# Presto Benchto benchmarks +# Trino Benchto benchmarks The Benchto 
benchmarks utilize [Benchto](https://github.com/trinodb/benchto) benchmarking -utility to do macro benchmarking of Presto. As opposed to micro benchmarking which exercises -a class or a small, coherent set of classes, macro benchmarks done with Benchto use Presto -end-to-end, by accessing it through its API (usually with `presto-jdbc`), executing queries, +utility to do macro benchmarking of Trino. As opposed to micro benchmarking which exercises +a class or a small, coherent set of classes, macro benchmarks done with Benchto use Trino +end-to-end, by accessing it through its API (usually with `trino-jdbc`), executing queries, measuring time and gathering various metrics. ## Benchmarking suites -Even though benchmarks exercise Presto end-to-end, a single benchmark cannot use all Presto +Even though benchmarks exercise Trino end-to-end, a single benchmark cannot use all Trino features. Therefore benchmarks are organized in suites, like: * *tpch* - queries closely following the [TPC-H](http://www.tpc.org/tpch/) benchmark @@ -18,7 +18,7 @@ features. Therefore benchmarks are organized in suites, like: ### Requirements -* Presto already installed on the target environment +* Trino already installed on the target environment * Basic understanding of Benchto [components and architecture](https://github.com/trinodb/benchto) * Benchto service [configured and running](https://github.com/trinodb/benchto/tree/master/benchto-service) * An environment [defined in Benchto service](https://github.com/trinodb/benchto/tree/master/benchto-service#creating-environment) @@ -27,10 +27,10 @@ features. Therefore benchmarks are organized in suites, like: Benchto driver needs to know two things: what benchmark is to be run and what environment it is to be run on. For the purpose of the following example, we will use `tpch` benchmark -and Presto server running at `localhost:8080`, with Benchto service running at `localhost:8081`. 
+and Trino server running at `localhost:8080`, with Benchto service running at `localhost:8081`. Benchto driver uses Spring Boot to locate environment configuration file, so to pass the -configuration. To continue with our example, one needs to place an `application-presto-devenv.yaml` +configuration. To continue with our example, one needs to place an `application-trino-devenv.yaml` file in the current directory (i.e. the directory from which the benchmark will be invoked), with the following content: @@ -63,7 +63,7 @@ macros: ### Bootstrapping benchmark data -* Make sure you have configured [Presto TPC-H connector](https://trino.io/docs/current/connector/tpch.html). +* Make sure you have configured [Trino TPC-H connector](https://trino.io/docs/current/connector/tpch.html). * Bootstrap benchmark data: ```bash testing/trino-benchto-benchmarks/generate_schemas/generate-tpch.py --factors sf1 --formats orc | trino-cli-[version]-executable.jar --server [trino_coordinator-url]:[port] @@ -88,6 +88,6 @@ With the scene set up as in the previous section, the benchmark can be run with: java -Xmx1g -jar trino-benchto-benchmarks/target/trino-benchto-benchmarks-*-executable.jar \ --sql trino-benchto-benchmarks/src/main/resources/sql \ --benchmarks trino-benchto-benchmarks/src/main/resources/benchmarks \ - --activeBenchmarks=presto/tpch --profile=presto-devenv \ + --activeBenchmarks=trino/tpch --profile=trino-devenv \ --overrides overrides.yaml ``` From 8e51f89b66a68dba4aa61c449cce11685ba4b578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=9Alizak?= Date: Fri, 7 Oct 2022 15:24:07 +0200 Subject: [PATCH 3/4] Additional updates to README --- testing/trino-benchto-benchmarks/README.md | 25 ++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md index 94cbf8bb2953..5c167e9b05ec 100644 --- a/testing/trino-benchto-benchmarks/README.md +++ 
b/testing/trino-benchto-benchmarks/README.md @@ -30,11 +30,15 @@ it is to be run on. For the purpose of the following example, we will use `tpch` and Trino server running at `localhost:8080`, with Benchto service running at `localhost:8081`. Benchto driver uses Spring Boot to locate environment configuration file, so to pass the -configuration. To continue with our example, one needs to place an `application-trino-devenv.yaml` +configuration. To continue with our example, one needs to place an `application.yaml` file in the current directory (i.e. the directory from which the benchmark will be invoked), with the following content: ```yaml +benchmarks: src/main/resources/benchmarks +sql: src/main/resources/sql +query-results-dir: target/results + benchmark-service: url: http://localhost:8081 @@ -42,7 +46,6 @@ data-sources: trino: url: jdbc:trino://localhost:8080 username: na - password: na driver-class-name: io.trino.jdbc.TrinoDriver environment: @@ -50,6 +53,7 @@ environment: presto: url: http://localhost:8080 + username: na benchmark: feature: @@ -77,17 +81,20 @@ runs or different underlying schemas. 
Create a simple `overrides.yaml` file: ```yaml runs: 10 -tpch_medium: tpcds_10gb_txt +tpch_300: tpch_sf1_orc +scale_300: 1 +tpch_1000: tpch_sf1_orc +scale_1000: 1 +tpch_3000: tpch_sf1_orc +scale_3000: 1 +prefix: "" ``` ### Running benchto-driver With the scene set up as in the previous section, the benchmark can be run with: ```bash -./mvnw clean package -pl :trino-benchto-benchmarks -java -Xmx1g -jar trino-benchto-benchmarks/target/trino-benchto-benchmarks-*-executable.jar \ - --sql trino-benchto-benchmarks/src/main/resources/sql \ - --benchmarks trino-benchto-benchmarks/src/main/resources/benchmarks \ - --activeBenchmarks=trino/tpch --profile=trino-devenv \ - --overrides overrides.yaml +java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.18/benchto-driver-0.18.jar" \ + --activeBenchmarks=trino/tpch \ + --overrides "overrides.yaml" ``` From 78c5ef4bd2b084b8ba6c2097903ddca506b7b716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=9Alizak?= Date: Fri, 7 Oct 2022 22:36:53 +0200 Subject: [PATCH 4/4] Update to Benchto which is compatible with JDK 17 --- pom.xml | 2 +- testing/trino-benchto-benchmarks/README.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index ebedf7c979a1..2b24c608ab65 100644 --- a/pom.xml +++ b/pom.xml @@ -656,7 +656,7 @@ io.trino.benchto benchto-driver - 0.19 + 0.20 diff --git a/testing/trino-benchto-benchmarks/README.md b/testing/trino-benchto-benchmarks/README.md index 5c167e9b05ec..1a3f47f0b6c5 100644 --- a/testing/trino-benchto-benchmarks/README.md +++ b/testing/trino-benchto-benchmarks/README.md @@ -94,7 +94,8 @@ prefix: "" With the scene set up as in the previous section, the benchmark can be run with: ```bash -java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.18/benchto-driver-0.18.jar" \ +./mvnw clean package -pl :trino-benchto-benchmarks +java -jar "$HOME/.m2/repository/io/trino/benchto/benchto-driver/0.20/benchto-driver-0.20-exec.jar" \ 
--activeBenchmarks=trino/tpch \ --overrides "overrides.yaml" ```