diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index eda117cdc7a..6407d19b69d 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -3,6 +3,11 @@ name: benchmarks on: workflow_dispatch: inputs: + runStandalone: + description: 'Run the benchmarks against standalone APM Server with Moxy' + required: false + type: boolean + default: false profile: description: 'The system profile used to run the benchmarks' required: false @@ -25,6 +30,7 @@ on: env: PNG_REPORT_FILE: out.png + BENCHMARK_CPU_OUT: default.pgo BENCHMARK_RESULT: benchmark-result.txt WORKING_DIRECTORY: testing/benchmark @@ -38,12 +44,14 @@ jobs: run: working-directory: ${{ env.WORKING_DIRECTORY }} permissions: - contents: read + contents: write id-token: write + pull-requests: write env: SSH_KEY: ./id_rsa_terraform TF_VAR_private_key: ./id_rsa_terraform TF_VAR_public_key: ./id_rsa_terraform.pub + TF_VAR_run_standalone: ${{ inputs.runStandalone }} TFVARS_SOURCE: ${{ inputs.profile || 'system-profiles/8GBx1zone.tfvars' }} # // Default to use an 8gb profile TF_VAR_BUILD_ID: ${{ github.run_id }} TF_VAR_ENVIRONMENT: ci @@ -52,6 +60,10 @@ jobs: GOBENCH_PASSWORD: ${{ secrets.GOBENCH_PASSWORD }} GOBENCH_USERNAME: ${{ secrets.GOBENCH_USERNAME }} GOBENCH_HOST: ${{ secrets.GOBENCH_HOST }} + # temporarily override to get faster feedback + BENCHMARK_WARMUP_TIME: 1m + BENCHMARK_COUNT: 2 + BENCHMARK_TIME: 1m steps: - uses: actions/checkout@v4 @@ -104,8 +116,14 @@ jobs: - name: Build apmbench run: make apmbench $SSH_KEY terraform.tfvars + - name: Build APM Server and Moxy + if: ${{ inputs.runStandalone }} + run: | + make moxy + make apm-server + - name: Override docker committed version - if: ${{ ! inputs.runOnStable }} + if: ${{ ! inputs.runOnStable && ! 
inputs.runStandalone}} run: make docker-override-committed-version - name: Spin up benchmark environment @@ -118,13 +136,24 @@ jobs: - name: Run benchmarks autotuned if: ${{ inputs.benchmarkAgents == '' }} - run: make run-benchmark-autotuned index-benchmark-results + run: make run-benchmark-autotuned - name: Run benchmarks self tuned if: ${{ inputs.benchmarkAgents != '' }} - run: make run-benchmark index-benchmark-results + run: make run-benchmark + + - name: Cat standalone server logs + if: ${{ inputs.runStandalone && failure() }} + run: make cat-apm-server-logs + + # Results are only indexed and uploaded if the run happens on the main branch. + + - name: Index benchmarks result + # if: github.ref == 'refs/heads/main' + run: make index-benchmark-results - name: Download PNG + # if: github.ref == 'refs/heads/main' run: >- ${{ github.workspace }}/.ci/scripts/download-png-from-kibana.sh ${{ secrets.KIBANA_BENCH_ENDPOINT }} @@ -133,6 +162,7 @@ jobs: $PNG_REPORT_FILE - name: Upload PNG + # if: github.ref == 'refs/heads/main' uses: actions/upload-artifact@v4 with: name: kibana-png-report @@ -140,6 +170,7 @@ jobs: if-no-files-found: error - name: Upload PNG to AWS S3 + # if: github.ref == 'refs/heads/main' id: s3-upload-png env: AWS_DEFAULT_REGION: us-east-1 @@ -149,13 +180,49 @@ jobs: echo "png_report_url=https://elastic-apm-server-benchmark-reports.s3.amazonaws.com/${DEST_NAME}" >> "$GITHUB_OUTPUT" - name: Upload benchmark result + # if: github.ref == 'refs/heads/main' uses: actions/upload-artifact@v4 - if: always() with: name: benchmark-result path: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_RESULT }} if-no-files-found: error + # The next section injects CPU profile collected by apmbench into the build. + # By copying the profile, uploading it to the artifacts and pushing it + # via a PR to update default.pgo. 
+ + - name: Copy CPU profile + if: ${{ inputs.runStandalone }} + run: make cp-cpuprof + + - name: Upload CPU profile + if: ${{ inputs.runStandalone }} + uses: actions/upload-artifact@v4 + with: + name: cpu-profile + path: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_CPU_OUT }} + if-no-files-found: error + + - name: Open PGO PR + if: ${{ inputs.runStandalone }} + run: | + cd "${{ github.workspace }}" + mv "$PROFILE_PATH" x-pack/apm-server/default.pgo + git config user.email "apm@elastic.co" + git config user.name "APM Server" + git fetch origin main + git checkout main + BRANCH="update-pgo-$(date +%s)" + git checkout -b "$BRANCH" + git add x-pack/apm-server/default.pgo + git commit -m "PGO: Update default.pgo from benchmarks $WORKFLOW." + git push -u origin "$BRANCH" + gh pr create -B main -H "$BRANCH" -t "PGO: Update default.pgo" -b "Update default.pgo CPU profile from the benchmarks [workflow]($WORKFLOW)." -R elastic/apm-server + env: + PROFILE_PATH: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_CPU_OUT }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }} + - name: Tear down benchmark environment if: always() run: make destroy diff --git a/systemtest/benchtest/profiles.go b/systemtest/benchtest/profiles.go index 9e2ee89b43a..e3dd240df44 100644 --- a/systemtest/benchtest/profiles.go +++ b/systemtest/benchtest/profiles.go @@ -88,7 +88,7 @@ func (p *profiles) recordCPU() error { if benchConfig.CPUProfile == "" { return nil } - duration := 2 * benchConfig.Benchtime + duration := benchConfig.Benchtime profile, err := fetchProfile("/debug/pprof/profile", duration) if err != nil { return fmt.Errorf("failed to fetch CPU profile: %w", err) diff --git a/systemtest/cmd/moxy/.gitignore b/systemtest/cmd/moxy/.gitignore new file mode 100644 index 00000000000..48537ea034d --- /dev/null +++ b/systemtest/cmd/moxy/.gitignore @@ -0,0 +1 @@ +moxy diff --git 
a/systemtest/cmd/moxy/go.mod b/systemtest/cmd/moxy/go.mod new file mode 100644 index 00000000000..66d6f326211 --- /dev/null +++ b/systemtest/cmd/moxy/go.mod @@ -0,0 +1,10 @@ +module moxy + +go 1.22.5 + +require ( + github.com/klauspost/compress v1.17.9 + go.uber.org/zap v1.27.0 +) + +require go.uber.org/multierr v1.10.0 // indirect diff --git a/systemtest/cmd/moxy/go.sum b/systemtest/cmd/moxy/go.sum new file mode 100644 index 00000000000..ba30898e48e --- /dev/null +++ b/systemtest/cmd/moxy/go.sum @@ -0,0 +1,16 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ= +go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/systemtest/cmd/moxy/main.go b/systemtest/cmd/moxy/main.go new file mode 100644 index 00000000000..1c026b557f8 --- /dev/null +++ 
b/systemtest/cmd/moxy/main.go @@ -0,0 +1,154 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/base64" + "flag" + "fmt" + "io" + "net/http" + "sync" + + "github.com/klauspost/compress/gzip" + "github.com/klauspost/compress/zstd" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +var memPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, +} + +func main() { + logLevel := zap.LevelFlag( + "loglevel", zapcore.InfoLevel, + "set log level to one of: DEBUG, INFO (default), WARN, ERROR, DPANIC, PANIC, FATAL", + ) + username := flag.String("username", "elastic", "authentication username to mimic ES") + password := flag.String("password", "", "authentication username to mimic ES") + flag.Parse() + zapcfg := zap.NewProductionConfig() + zapcfg.EncoderConfig.EncodeTime = zapcore.RFC3339TimeEncoder + zapcfg.EncoderConfig.EncodeLevel = zapcore.CapitalColorLevelEncoder + zapcfg.Encoding = "console" + zapcfg.Level = zap.NewAtomicLevelAt(*logLevel) + logger, err := zapcfg.Build() + if err != nil { + panic(err) + } + if *username == "" || *password == "" { + logger.Fatal("both username and password are required") + } + defer logger.Sync() + s := http.Server{ + Addr: ":9200", + Handler: handler(logger, *username, *password), + } + if err := s.ListenAndServe(); err != nil { + logger.Fatal("listen error", zap.Error(err)) + } +} + +func handler(logger *zap.Logger, username, password string) http.Handler { + expectedAuth := fmt.Sprintf("%s:%s", username, password) + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-Elastic-Product", "Elasticsearch") + switch r.URL.Path { + case "/": + w.Write([]byte(`{ + "name": "instance-0000000001", + "cluster_name": "eca3b3c3bbee4816bb92f82184e328dd", + "cluster_uuid": "cc49813b6b8e2138fbb8243ae2b3deed", + "version": { + "number": "8.15.1", + "build_flavor": "default", + "build_type": "docker", + "build_hash": "253e8544a65ad44581194068936f2a5d57c2c051", + "build_date": 
"2024-09-02T22:04:47.310170297Z", + "build_snapshot": false, + "lucene_version": "9.11.1", + "minimum_wire_compatibility_version": "7.17.0", + "minimum_index_compatibility_version": "7.0.0" + }, + "tagline": "You Know, for Search" + }`)) + return + case "/_security/user/_has_privileges": + w.Write([]byte(`{"username":"admin","has_all_requested":true,"cluster":{},"index":{},"application":{"apm":{"-":{"event:write":true}}}}`)) + case "/_bulk": + auth := r.Header.Get("Authorization") + actualAuth, err := base64.StdEncoding.DecodeString(auth) + if err != nil || string(actualAuth) != expectedAuth { + logger.Error( + "authentication failed", + zap.Error(err), + zap.String("actual", string(actualAuth)), + zap.String("expected", expectedAuth), + ) + w.WriteHeader(http.StatusUnauthorized) + return + } + + first := true + var body io.Reader + switch r.Header.Get("Content-Encoding") { + case "gzip": + r, err := gzip.NewReader(r.Body) + if err != nil { + logger.Error("gzip reader err", zap.Error(err)) + http.Error(w, fmt.Sprintf("reader error: %v", err), http.StatusInternalServerError) + return + } + defer r.Close() + body = r + case "zstd": + r, err := zstd.NewReader(r.Body) + if err != nil { + logger.Error("zstd reader err", zap.Error(err)) + http.Error(w, fmt.Sprintf("reader error: %v", err), http.StatusInternalServerError) + return + } + defer r.Close() + body = r + default: + body = r.Body + } + + jsonw := memPool.Get().(*bytes.Buffer) + defer func() { + jsonw.Reset() + memPool.Put(jsonw) + }() + + jsonw.Write([]byte(`{"items":[`)) + scanner := bufio.NewScanner(body) + for scanner.Scan() { + // Action is always "create", skip decoding. 
+ if !scanner.Scan() { + logger.Error("unexpected payload") + http.Error(w, "expected source", http.StatusInternalServerError) + return + } + if first { + first = false + } else { + jsonw.WriteByte(',') + } + jsonw.Write([]byte(`{"create":{"status":201}}`)) + } + if err := scanner.Err(); err != nil { + logger.Error("scanner error", zap.Error(err)) + http.Error(w, fmt.Sprintf("scanner error: %v", err), http.StatusInternalServerError) + } else { + jsonw.Write([]byte(`]}`)) + w.Write(jsonw.Bytes()) + } + // TODO additionally report events throughput metric here, to index into benchmarks. + default: + logger.Error("unknown path", zap.String("path", r.URL.Path)) + } + }) +} diff --git a/testing/benchmark/Makefile b/testing/benchmark/Makefile index cf894325b58..0c034355375 100644 --- a/testing/benchmark/Makefile +++ b/testing/benchmark/Makefile @@ -2,6 +2,15 @@ APMBENCH_PATH ?= ../../systemtest/cmd/apmbench APMBENCH_GOOS ?= linux APMBENCH_GOARCH ?= amd64 +MOXY_PATH ?= ../../systemtest/cmd/moxy +MOXY_GOOS ?= linux +MOXY_GOARCH ?= amd64 + +APM_SERVER_GOOS ?= linux +APM_SERVER_GOARCH ?= amd64 + +RUN_STANDALONE ?= false + TFVARS_SOURCE ?= terraform.tfvars.example BENCHMARK_WARMUP_TIME ?= 5m @@ -23,6 +32,7 @@ SSH_USER ?= ec2-user SSH_OPTS ?= -o LogLevel=ERROR -o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=10 SSH_KEY ?= ~/.ssh/id_rsa_terraform WORKER_IP = $(shell terraform output -raw public_ip) +APM_SERVER_IP = $(shell terraform output -raw apm_server_ip) SHELL = /bin/bash .SHELLFLAGS = -o pipefail -c @@ -67,6 +77,15 @@ apmbench: @echo "-> Building apmbench..." @cd $(APMBENCH_PATH) && CGO_ENABLED=0 GOOS=$(APMBENCH_GOOS) GOARCH=$(APMBENCH_GOARCH) go build . +.PHONY: moxy +moxy: + @echo "-> Building moxy..." + @cd $(MOXY_PATH) && CGO_ENABLED=0 GOOS=$(MOXY_GOOS) GOARCH=$(MOXY_GOARCH) go build . + +.PHONY: apm-server +apm-server: + @cd ../.. 
&& make build/apm-server-$(APM_SERVER_GOOS)-$(APM_SERVER_GOARCH) && mv build/apm-server-$(APM_SERVER_GOOS)-$(APM_SERVER_GOARCH) build/apm-server + .PHONY: init init: @terraform init @@ -110,6 +129,12 @@ index-benchmark-results: _default-gobench-vars .PHONY: _default-gobench-vars _default-gobench-vars: +ifeq ($(RUN_STANDALONE),true) + $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_size=$(shell echo var.standalone_apm_server_instance_size | terraform console | tr -d '"')) + $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),moxy_size=$(shell echo var.standalone_moxy_instance_size | terraform console | tr -d '"')) + $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),build_sha=$(shell git rev-parse HEAD)) + $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),bench_mode=standalone) +else # TODO(marclop) Update code below to use a foor loop, rather than copying the lines. $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_size=$(shell echo var.apm_server_size | terraform console | tr -d '"')) $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),elasticsearch_size=$(shell echo var.elasticsearch_size | terraform console | tr -d '"')) @@ -117,6 +142,12 @@ _default-gobench-vars: $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_zone_count=$(shell echo var.apm_server_zone_count | terraform console | tr -d '"')) $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),elasticsearch_zone_count=$(shell echo var.elasticsearch_zone_count | terraform console | tr -d '"')) $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),build_sha=$(shell curl -sL -H "Authorization: Bearer $(shell terraform output -raw apm_secret_token )" $(shell terraform output -raw apm_server_url ) | jq -r '.build_sha')) + $(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),bench_mode=cloud) +endif + +.PHONY: cat-apm-server-logs +cat-apm-server-logs: + @ssh $(SSH_OPTS) -i $(SSH_KEY) $(SSH_USER)@$(APM_SERVER_IP) "cat /var/log/apm-server/*" $(SSH_KEY): 
@ssh-keygen -t rsa -b 4096 -C "$(USER)@elastic.co" -N "" -f $(SSH_KEY) diff --git a/testing/benchmark/main.tf b/testing/benchmark/main.tf index e699f56485e..984ae7375c7 100644 --- a/testing/benchmark/main.tf +++ b/testing/benchmark/main.tf @@ -45,7 +45,46 @@ locals { name_prefix = "${coalesce(var.user_name, "unknown-user")}-bench" } +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "3.14.0" + + name = "${var.user_name}-worker" + cidr = var.vpc_cidr + + azs = [for letter in ["a", "b", "c"] : "${var.worker_region}${letter}"] + public_subnets = var.public_cidr + enable_ipv6 = false + enable_nat_gateway = false + single_nat_gateway = false + + manage_default_security_group = true + default_security_group_ingress = [ + { + "from_port" : 0, + "to_port" : 0, + "protocol" : -1, + "self" : true, + "cidr_blocks" : "0.0.0.0/0", + } + ] + default_security_group_egress = [ + { + "from_port" : 0, + "to_port" : 0, + "protocol" : -1, + "cidr_blocks" : "0.0.0.0/0", + } + ] + + tags = merge(local.ci_tags, module.tags.tags) + vpc_tags = { + Name = "vpc-${var.user_name}-worker" + } +} + module "ec_deployment" { + count = var.run_standalone ? 0 : 1 source = "../infra/terraform/modules/ec_deployment" region = var.ess_region @@ -73,12 +112,13 @@ module "ec_deployment" { module "benchmark_worker" { source = "../infra/terraform/modules/benchmark_executor" - region = var.worker_region + vpc_id = module.vpc.vpc_id + region = var.worker_region user_name = var.user_name - apm_server_url = module.ec_deployment.apm_url - apm_secret_token = module.ec_deployment.apm_secret_token + apm_server_url = var.run_standalone ? module.standalone_apm_server[0].apm_server_url : module.ec_deployment[0].apm_url + apm_secret_token = var.run_standalone ? 
module.standalone_apm_server[0].apm_secret_token : module.ec_deployment[0].apm_secret_token apmbench_bin_path = var.apmbench_bin_path instance_type = var.worker_instance_type @@ -86,5 +126,41 @@ module "benchmark_worker" { public_key = var.public_key private_key = var.private_key - tags = merge(local.ci_tags, module.tags.tags) + tags = merge(local.ci_tags, module.tags.tags) + depends_on = [module.moxy, module.ec_deployment] +} + +module "moxy" { + count = var.run_standalone ? 1 : 0 + source = "../infra/terraform/modules/moxy" + + vpc_id = module.vpc.vpc_id + instance_type = var.standalone_moxy_instance_size + moxy_bin_path = var.moxy_bin_path + + aws_provisioner_key_name = var.private_key + + tags = merge(local.ci_tags, module.tags.tags) + depends_on = [module.vpc] +} + + +module "standalone_apm_server" { + count = var.run_standalone ? 1 : 0 + source = "../infra/terraform/modules/standalone_apm_server" + + vpc_id = module.vpc.vpc_id + aws_os = "amzn2-ami-hvm-*-x86_64-ebs" + apm_instance_type = var.standalone_apm_server_instance_size + apm_server_bin_path = var.apm_server_bin_path + ea_managed = false + + aws_provisioner_key_name = var.private_key + + elasticsearch_url = module.moxy[0].moxy_url + elasticsearch_username = "elastic" + elasticsearch_password = module.moxy[0].moxy_password + + tags = merge(local.ci_tags, module.tags.tags) + depends_on = [module.moxy] } diff --git a/testing/benchmark/outputs.tf b/testing/benchmark/outputs.tf index 371d9c43760..247ad4fa2df 100644 --- a/testing/benchmark/outputs.tf +++ b/testing/benchmark/outputs.tf @@ -4,38 +4,46 @@ output "public_ip" { } output "elasticsearch_url" { - value = module.ec_deployment.elasticsearch_url + value = !var.run_standalone ? module.ec_deployment[0].elasticsearch_url : "" description = "The secure Elasticsearch URL" } output "elasticsearch_username" { - value = module.ec_deployment.elasticsearch_username + value = !var.run_standalone ? 
module.ec_deployment[0].elasticsearch_username : "" description = "The Elasticsearch username" sensitive = true } output "elasticsearch_password" { - value = module.ec_deployment.elasticsearch_password + value = !var.run_standalone ? module.ec_deployment[0].elasticsearch_password : "" description = "The Elasticsearch password" sensitive = true } output "kibana_url" { - value = module.ec_deployment.kibana_url + value = !var.run_standalone ? module.ec_deployment[0].kibana_url : "" description = "The secure Kibana URL" } + output "apm_secret_token" { - value = module.ec_deployment.apm_secret_token + value = var.run_standalone ? module.standalone_apm_server[0].apm_secret_token : module.ec_deployment[0].apm_secret_token description = "The APM Server secret token" sensitive = true } output "apm_server_url" { - value = module.ec_deployment.apm_url + value = var.run_standalone ? module.standalone_apm_server[0].apm_server_url : module.ec_deployment[0].apm_url description = "The APM Server URL" + sensitive = true +} + +output "apm_server_ip" { + value = var.run_standalone ? module.standalone_apm_server[0].apm_server_ip : "" + description = "The APM Server EC2 IP address" + sensitive = true } output "admin_console_url" { - value = module.ec_deployment.admin_console_url + value = !var.run_standalone ? module.ec_deployment[0].admin_console_url : "" description = "The admin console URL" } diff --git a/testing/benchmark/system-profiles/16GBx2zone.tfvars b/testing/benchmark/system-profiles/16GBx2zone.tfvars index d081604f332..e6ba492fa43 100644 --- a/testing/benchmark/system-profiles/16GBx2zone.tfvars +++ b/testing/benchmark/system-profiles/16GBx2zone.tfvars @@ -1,5 +1,11 @@ user_name = "USER" +# APM bench + +worker_instance_type = "c6i.2xlarge" + +# Elastic Cloud + # The number of AZs the APM Server should span. apm_server_zone_count = 1 # The Elasticsearch cluster node size. 
@@ -10,5 +16,8 @@ elasticsearch_zone_count = 2 apm_server_size = "16g" # Number of shards for the ES indices apm_shards = 4 -# Benchmarks executor size executor -worker_instance_type = "c6i.2xlarge" + +# Standalone + +standalone_apm_server_instance_size = "c6i.4xlarge" +standalone_moxy_instance_size = "c6i.8xlarge" diff --git a/testing/benchmark/system-profiles/1GBx1zone.tfvars b/testing/benchmark/system-profiles/1GBx1zone.tfvars index a2ca3dac002..8b1ff546e60 100644 --- a/testing/benchmark/system-profiles/1GBx1zone.tfvars +++ b/testing/benchmark/system-profiles/1GBx1zone.tfvars @@ -1,5 +1,11 @@ user_name = "USER" +# APM bench + +worker_instance_type = "c6i.large" + +# Elastic Cloud + # The number of AZs the APM Server should span. apm_server_zone_count = 1 # The Elasticsearch cluster node size. @@ -8,3 +14,8 @@ elasticsearch_size = "16g" elasticsearch_zone_count = 2 # APM server instance size apm_server_size = "1g" + +# Standalone + +standalone_apm_server_instance_size = "c6i.large" +standalone_moxy_instance_size = "c6i.xlarge" diff --git a/testing/benchmark/system-profiles/2GBx1zone.tfvars b/testing/benchmark/system-profiles/2GBx1zone.tfvars index 668f12f9edf..a3114b4b989 100644 --- a/testing/benchmark/system-profiles/2GBx1zone.tfvars +++ b/testing/benchmark/system-profiles/2GBx1zone.tfvars @@ -1,5 +1,11 @@ user_name = "USER" +# APM bench + +worker_instance_type = "c6i.large" + +# Elastic Cloud + # The number of AZs the APM Server should span. apm_server_zone_count = 1 # The Elasticsearch cluster node size. @@ -7,4 +13,9 @@ elasticsearch_size = "16g" # The number of AZs the Elasticsearch cluster should have. 
elasticsearch_zone_count = 2 # APM server instance size -apm_server_size = "2g" \ No newline at end of file +apm_server_size = "2g" + +# Standalone + +standalone_apm_server_instance_size = "c6i.large" +standalone_moxy_instance_size = "c6i.xlarge" diff --git a/testing/benchmark/system-profiles/32GBx2zone.tfvars b/testing/benchmark/system-profiles/32GBx2zone.tfvars index 10a9180257b..7735b9f7695 100644 --- a/testing/benchmark/system-profiles/32GBx2zone.tfvars +++ b/testing/benchmark/system-profiles/32GBx2zone.tfvars @@ -1,5 +1,11 @@ user_name = "USER" +# APM bench + +worker_instance_type = "c6i.2xlarge" + +# Elastic Cloud + # The number of AZs the APM Server should span. apm_server_zone_count = 1 # The Elasticsearch cluster node size. @@ -12,5 +18,8 @@ elasticsearch_dedicated_masters = true apm_server_size = "32g" # Number of shards for the ES indices apm_shards = 4 -# Benchmarks executor size executor -worker_instance_type = "c6i.2xlarge" + +# Standalone + +standalone_apm_server_instance_size = "c6i.8xlarge" +standalone_moxy_instance_size = "c6i.16xlarge" diff --git a/testing/benchmark/system-profiles/4GBx1zone.tfvars b/testing/benchmark/system-profiles/4GBx1zone.tfvars index f55f9099444..23732bb8448 100644 --- a/testing/benchmark/system-profiles/4GBx1zone.tfvars +++ b/testing/benchmark/system-profiles/4GBx1zone.tfvars @@ -1,5 +1,11 @@ user_name = "USER" +# APM bench + +worker_instance_type = "c6i.xlarge" + +# Elastic Cloud + # The number of AZs the APM Server should span. apm_server_zone_count = 1 # The Elasticsearch cluster node size. @@ -7,4 +13,9 @@ elasticsearch_size = "32g" # The number of AZs the Elasticsearch cluster should have. 
elasticsearch_zone_count = 2 # APM server instance size -apm_server_size = "4g" \ No newline at end of file +apm_server_size = "4g" + +# Standalone + +standalone_apm_server_instance_size = "c6i.xlarge" +standalone_moxy_instance_size = "c6i.2xlarge" diff --git a/testing/benchmark/system-profiles/8GBx1zone.tfvars b/testing/benchmark/system-profiles/8GBx1zone.tfvars index 62719a89b15..a5802b19e29 100644 --- a/testing/benchmark/system-profiles/8GBx1zone.tfvars +++ b/testing/benchmark/system-profiles/8GBx1zone.tfvars @@ -1,5 +1,11 @@ user_name = "USER" +# APM bench + +worker_instance_type = "c6i.xlarge" + +# Elastic Cloud + # The number of AZs the APM Server should span. apm_server_zone_count = 1 # The Elasticsearch cluster node size. @@ -8,5 +14,8 @@ elasticsearch_size = "64g" elasticsearch_zone_count = 2 # APM server instance size apm_server_size = "8g" -# Benchmarks executor size executor -worker_instance_type = "c6i.2xlarge" + +# Standalone + +standalone_apm_server_instance_size = "c6i.2xlarge" +standalone_moxy_instance_size = "c6i.4xlarge" diff --git a/testing/benchmark/terraform.tfvars.example b/testing/benchmark/terraform.tfvars.example index d58973b61b1..2b44a828d6c 100644 --- a/testing/benchmark/terraform.tfvars.example +++ b/testing/benchmark/terraform.tfvars.example @@ -67,3 +67,9 @@ user_name = "USER" # Override the default shard settings for APM indices. Defaults to 0, which doesn't # change the default shard settings. # apm_shards = 12 + +# Override the default APM Server VM size in standalone bench mode. +# standalone_apm_server_instance_size = "c6i.2xlarge" + +# Override the default Moxy VM size in standalone bench mode. 
+# standalone_moxy_instance_size = "c6i.4xlarge" \ No newline at end of file diff --git a/testing/benchmark/variables.tf b/testing/benchmark/variables.tf index deb2d05d6b0..c658e63aab2 100644 --- a/testing/benchmark/variables.tf +++ b/testing/benchmark/variables.tf @@ -5,6 +5,12 @@ variable "user_name" { type = string } +variable "run_standalone" { + default = false + description = "If set run benchmarks against standalone APM Server conneted to moxy" + type = bool +} + ## Deployment configuration variable "ess_region" { @@ -86,6 +92,48 @@ variable "drop_pipeline" { type = bool } +# Standalone + +variable "apm_server_bin_path" { + default = "../../build" + type = string + description = "Optional path to the apm-server binary" +} + +variable "moxy_bin_path" { + default = "../../systemtest/cmd/moxy" + type = string + description = "Optional path to the moxy binary" +} + +variable "standalone_apm_server_instance_size" { + default = "c6i.2xlarge" + type = string + description = "Optional instance type to use for the APM Server VM" +} + +variable "standalone_moxy_instance_size" { + default = "c6i.4xlarge" + type = string + description = "Optional instance type to use for the Moxy VM" +} + +## VPC Network settings + +variable "vpc_cidr" { + default = "192.168.44.0/24" + type = string +} + +variable "public_cidr" { + default = [ + "192.168.44.0/26", + "192.168.44.64/26", + "192.168.44.128/26", + ] + type = list(string) +} + ## Worker configuraiton variable "worker_region" { @@ -101,7 +149,7 @@ variable "apmbench_bin_path" { } variable "worker_instance_type" { - default = "c6i.large" + default = "c6i.xlarge" type = string description = "Optional instance type to use for the worker VM" } diff --git a/testing/infra/terraform/modules/benchmark_executor/instance.tf b/testing/infra/terraform/modules/benchmark_executor/instance.tf index 18d55462c66..7b41f643ebf 100644 --- a/testing/infra/terraform/modules/benchmark_executor/instance.tf +++ 
b/testing/infra/terraform/modules/benchmark_executor/instance.tf @@ -6,50 +6,6 @@ locals { } } -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "3.14.0" - - name = "${var.user_name}-worker" - cidr = var.vpc_cidr - - azs = [for letter in ["a", "b", "c"] : "${var.region}${letter}"] - public_subnets = var.public_cidr - enable_ipv6 = false - enable_nat_gateway = false - single_nat_gateway = false - - manage_default_security_group = true - default_security_group_ingress = [ - { - "from_port" : 0, - "to_port" : 0, - "protocol" : -1, - "self" : true, - "cidr_blocks" : "0.0.0.0/0", - } - ] - default_security_group_egress = [ - { - "from_port" : 0, - "to_port" : 0, - "protocol" : -1, - "cidr_blocks" : "0.0.0.0/0", - } - ] - - tags = merge(var.tags, local.ec2_tags) - vpc_tags = { - Name = "vpc-${var.user_name}-worker" - } -} - -resource "aws_key_pair" "worker" { - key_name = "${var.user_name}_worker_key" - public_key = file(var.public_key) - tags = merge(var.tags, local.ec2_tags) -} - data "aws_ami" "worker_ami" { owners = ["amazon"] most_recent = true @@ -60,6 +16,18 @@ data "aws_ami" "worker_ami" { } } +data "aws_subnets" "public_subnets" { + filter { + name = "vpc-id" + values = [var.vpc_id] + } +} + +data "aws_security_group" "security_group" { + vpc_id = var.vpc_id + name = "default" +} + module "ec2_instance" { source = "terraform-aws-modules/ec2-instance/aws" @@ -68,9 +36,15 @@ module "ec2_instance" { ami = data.aws_ami.worker_ami.id instance_type = var.instance_type monitoring = false - vpc_security_group_ids = [module.vpc.default_security_group_id] - subnet_id = module.vpc.public_subnets[0] + vpc_security_group_ids = [data.aws_security_group.security_group.id] + subnet_id = data.aws_subnets.public_subnets.ids[0] associate_public_ip_address = true key_name = aws_key_pair.worker.id tags = merge(var.tags, local.ec2_tags) } + +resource "aws_key_pair" "worker" { + key_name = "${var.user_name}_worker_key" + public_key = file(var.public_key) + tags = 
merge(var.tags, local.ec2_tags) +} diff --git a/testing/infra/terraform/modules/benchmark_executor/variables.tf b/testing/infra/terraform/modules/benchmark_executor/variables.tf index 5edc17f5bae..79e21837e5e 100644 --- a/testing/infra/terraform/modules/benchmark_executor/variables.tf +++ b/testing/infra/terraform/modules/benchmark_executor/variables.tf @@ -18,6 +18,11 @@ variable "instance_type" { description = "Optional instance type to use for the worker VM" } +variable "vpc_id" { + description = "VPC ID to provision the EC2 instance" + type = string +} + variable "apm_secret_token" { default = "" type = string @@ -39,22 +44,6 @@ variable "tags" { description = "Optional set of tags to use for all resources" } -## VPC Network settings - -variable "vpc_cidr" { - default = "192.168.44.0/24" - type = string -} - -variable "public_cidr" { - default = [ - "192.168.44.0/26", - "192.168.44.64/26", - "192.168.44.128/26", - ] - type = list(string) -} - variable "region" { default = "us-west2" type = string diff --git a/testing/infra/terraform/modules/moxy/main.tf b/testing/infra/terraform/modules/moxy/main.tf new file mode 100644 index 00000000000..0429d76d164 --- /dev/null +++ b/testing/infra/terraform/modules/moxy/main.tf @@ -0,0 +1,104 @@ +locals { + moxy_port = "9200" + bin_path = "/tmp/moxy" +} + +data "aws_ami" "worker_ami" { + owners = ["amazon"] + most_recent = true + + filter { + name = "name" + values = ["amzn2-ami-hvm-*-x86_64-ebs"] + } +} + +data "aws_subnets" "public_subnets" { + filter { + name = "vpc-id" + values = [var.vpc_id] + } +} + +resource "aws_security_group" "main" { + vpc_id = var.vpc_id + egress = [ + { + cidr_blocks = ["0.0.0.0/0", ] + description = "" + from_port = 0 + ipv6_cidr_blocks = [] + prefix_list_ids = [] + protocol = "-1" + security_groups = [] + self = false + to_port = 0 + } + ] + ingress = [ + { + cidr_blocks = ["0.0.0.0/0", ] + description = "" + from_port = 22 + ipv6_cidr_blocks = [] + prefix_list_ids = [] + protocol = "tcp" + 
security_groups = [] + self = false + to_port = 22 + }, + { + cidr_blocks = ["0.0.0.0/0", ] + description = "" + from_port = local.moxy_port + ipv6_cidr_blocks = [] + prefix_list_ids = [] + protocol = "tcp" + security_groups = [] + self = false + to_port = local.moxy_port + } + ] +} + +resource "aws_instance" "moxy" { + ami = data.aws_ami.worker_ami.id + instance_type = var.instance_type + subnet_id = data.aws_subnets.public_subnets.ids[0] + vpc_security_group_ids = [aws_security_group.main.id] + key_name = aws_key_pair.provisioner_key.key_name + monitoring = false + + connection { + type = "ssh" + user = "ec2-user" + host = self.public_ip + private_key = file("${var.aws_provisioner_key_name}") + } + + provisioner "file" { + source = "${var.moxy_bin_path}/moxy" + destination = local.bin_path + } + provisioner "remote-exec" { + inline = [ + "sudo cp ${local.bin_path} moxy", + "sudo chmod +x moxy", + "screen -d -m ./moxy -password=${random_password.moxy_password.result}", + "sleep 1" + ] + } + + tags = var.tags +} + +resource "aws_key_pair" "provisioner_key" { + public_key = file("${var.aws_provisioner_key_name}.pub") + tags = var.tags +} + + +resource "random_password" "moxy_password" { + length = 16 + special = false +} diff --git a/testing/infra/terraform/modules/moxy/outputs.tf b/testing/infra/terraform/modules/moxy/outputs.tf new file mode 100644 index 00000000000..89414843b6b --- /dev/null +++ b/testing/infra/terraform/modules/moxy/outputs.tf @@ -0,0 +1,10 @@ +output "moxy_url" { + value = "http://${aws_instance.moxy.public_ip}:${local.moxy_port}" + description = "The Moxy server URL" +} + +output "moxy_password" { + value = random_password.moxy_password.result + description = "The Moxy password for communication" + sensitive = true +} diff --git a/testing/infra/terraform/modules/moxy/variables.tf b/testing/infra/terraform/modules/moxy/variables.tf new file mode 100644 index 00000000000..9b4cb7a4131 --- /dev/null +++ 
b/testing/infra/terraform/modules/moxy/variables.tf @@ -0,0 +1,25 @@ +variable "instance_type" { + type = string + description = "Moxy instance type" +} + +variable "vpc_id" { + description = "VPC ID to provision the EC2 instance" + type = string +} + +variable "aws_provisioner_key_name" { + description = "ssh key name to create the aws key pair and remote provision the EC2 instance" + type = string +} + +variable "moxy_bin_path" { + type = string + description = "Path to the directory containing the moxy binary to copy to the worker machine" +} + +variable "tags" { + type = map(string) + default = {} + description = "Optional set of tags to use for all resources" +} diff --git a/testing/infra/terraform/modules/standalone_apm_server/apm-server.yml.tftpl b/testing/infra/terraform/modules/standalone_apm_server/apm-server.yml.tftpl index 5061811d904..e51007ccf5a 100644 --- a/testing/infra/terraform/modules/standalone_apm_server/apm-server.yml.tftpl +++ b/testing/infra/terraform/modules/standalone_apm_server/apm-server.yml.tftpl @@ -6,6 +6,11 @@ apm-server: secret_token: ${apm_secret_token} rum: enabled: true + expvar: + enabled: true + pprof: + enabled: true + output: elasticsearch: hosts: [ ${elasticsearch_url} ] diff --git a/testing/infra/terraform/modules/standalone_apm_server/main.tf b/testing/infra/terraform/modules/standalone_apm_server/main.tf index 251251181b0..a43f81b36da 100644 --- a/testing/infra/terraform/modules/standalone_apm_server/main.tf +++ b/testing/infra/terraform/modules/standalone_apm_server/main.tf @@ -6,6 +6,7 @@ locals { "debian-10-arm64" = "136693071363" # debian "debian-11-arm64" = "136693071363" # debian "amzn2-ami-kernel-5.10" = "137112412989" # amazon + "amzn2-ami-hvm-*-x86_64-ebs" = "137112412989" # amazon "al2023-ami-2023" = "137112412989" # amazon "RHEL-7" = "309956199498" # Red Hat "RHEL-8" = "309956199498" # Red Hat @@ -18,6 +19,7 @@ locals { "debian-10-arm64" = "t4g.nano" "debian-11-arm64" = "t4g.nano" "amzn2-ami-kernel-5.10" = "t4g.nano" +
"amzn2-ami-hvm-*-x86_64-ebs" = "t3a.nano" # x86_64 AMI; t4g (Graviton) is arm64-only "al2023-ami-2023" = "t4g.nano" "RHEL-7" = "t3a.micro" # RHEL-7 doesn't support arm "RHEL-8" = "t4g.micro" # RHEL doesn't support nano instances @@ -30,6 +32,7 @@ locals { "debian-10-arm64" = "arm64" "debian-11-arm64" = "arm64" "amzn2-ami-kernel-5.10" = "arm64" + "amzn2-ami-hvm-*-x86_64-ebs" = "x86_64" "al2023-ami-2023" = "arm64" "RHEL-7" = "x86_64" # RHEL-7 doesn't support arm "RHEL-8" = "arm64" @@ -66,13 +69,16 @@ locals { "debian-10-arm64" = "admin" "debian-11-arm64" = "admin" "amzn2-ami-kernel-5.10" = "ec2-user" + "amzn2-ami-hvm-*-x86_64-ebs" = "ec2-user" "al2023-ami-2023" = "ec2-user" "RHEL-7" = "ec2-user" "RHEL-8" = "ec2-user" "RHEL-9" = "ec2-user" } + apm_port = "8200" conf_path = "/tmp/local-apm-config.yml" + bin_path = "/tmp/apm-server" } data "aws_ami" "os" { @@ -101,7 +107,15 @@ data "aws_ami" "os" { owners = [local.image_owners[var.aws_os]] } +data "aws_subnets" "public_subnets" { + filter { + name = "vpc-id" + values = [var.vpc_id] + } +} + resource "aws_security_group" "main" { + vpc_id = var.vpc_id egress = [ { cidr_blocks = ["0.0.0.0/0", ] @@ -142,9 +156,12 @@ resource "aws_security_group" "main" { } resource "aws_instance" "apm" { - ami = data.aws_ami.os.id - instance_type = local.instance_types[var.aws_os] - key_name = aws_key_pair.provisioner_key.key_name + ami = data.aws_ami.os.id + instance_type = var.apm_instance_type == "" ? local.instance_types[var.aws_os] : var.apm_instance_type + subnet_id = data.aws_subnets.public_subnets.ids[0] + vpc_security_group_ids = [aws_security_group.main.id] + key_name = aws_key_pair.provisioner_key.key_name + monitoring = false connection { type = "ssh" @@ -153,6 +170,11 @@ resource "aws_instance" "apm" { private_key = file("${var.aws_provisioner_key_name}") } + provisioner "file" { + source = "${var.apm_server_bin_path}/apm-server" + destination = local.bin_path + } + provisioner "file" { destination = local.conf_path content = templatefile(var.ea_managed ?
"${path.module}/elastic-agent.yml.tftpl" : "${path.module}/apm-server.yml.tftpl", { @@ -172,15 +194,24 @@ resource "aws_instance" "apm" { "sudo cp ${local.conf_path} /etc/elastic-agent/elastic-agent.yml", "sudo systemctl start elastic-agent", "sleep 1", - ] : [ - local.instance_standalone_provision_cmd[var.aws_os], - "sudo cp ${local.conf_path} /etc/apm-server/apm-server.yml", - "sudo systemctl start apm-server", - "sleep 1", - ] + ] : ( + var.apm_server_bin_path == "" ? [ + local.instance_standalone_provision_cmd[var.aws_os], + "sudo cp ${local.conf_path} /etc/apm-server/apm-server.yml", + "sudo systemctl start apm-server", + "sleep 1", + ] : [ + "sudo cp ${local.bin_path} apm-server", + "sudo chmod +x apm-server", + "sudo cp ${local.conf_path} apm-server.yml", + "sudo mkdir -p -m 777 /var/log/apm-server", + "screen -d -m ./apm-server", + "sleep 1" + ] + ) } - vpc_security_group_ids = [aws_security_group.main.id] + tags = var.tags } resource "null_resource" "apm_server_log" { @@ -208,8 +239,8 @@ data "external" "latest_apm_server" { } resource "aws_key_pair" "provisioner_key" { - key_name = var.aws_provisioner_key_name public_key = file("${var.aws_provisioner_key_name}.pub") + tags = var.tags } resource "random_password" "apm_secret_token" { diff --git a/testing/infra/terraform/modules/standalone_apm_server/outputs.tf b/testing/infra/terraform/modules/standalone_apm_server/outputs.tf index f3cda741079..138e496e592 100644 --- a/testing/infra/terraform/modules/standalone_apm_server/outputs.tf +++ b/testing/infra/terraform/modules/standalone_apm_server/outputs.tf @@ -5,6 +5,11 @@ output "apm_secret_token" { } output "apm_server_url" { - value = "${aws_instance.apm.public_ip}:${local.apm_port}" + value = "http://${aws_instance.apm.public_ip}:${local.apm_port}" description = "The APM Server URL" } + +output "apm_server_ip" { + value = aws_instance.apm.public_ip + description = "The APM Server EC2 IP address" +} diff --git
a/testing/infra/terraform/modules/standalone_apm_server/provider.tf b/testing/infra/terraform/modules/standalone_apm_server/provider.tf deleted file mode 100644 index 3d860298151..00000000000 --- a/testing/infra/terraform/modules/standalone_apm_server/provider.tf +++ /dev/null @@ -1,6 +0,0 @@ -provider "aws" { - region = var.worker_region - default_tags { - tags = var.tags - } -} diff --git a/testing/infra/terraform/modules/standalone_apm_server/variables.tf b/testing/infra/terraform/modules/standalone_apm_server/variables.tf index d0e11890832..40b999c126c 100644 --- a/testing/infra/terraform/modules/standalone_apm_server/variables.tf +++ b/testing/infra/terraform/modules/standalone_apm_server/variables.tf @@ -1,12 +1,22 @@ variable "aws_os" { default = "" - description = "Optional aws ec2 instance OS" + description = "Optional aws EC2 instance OS" type = string } -variable "aws_provisioner_key_name" { +variable "apm_instance_type" { default = "" - description = "Optional ssh key name to create the aws key pair and remote provision the ec2 instance" + type = string + description = "Optional apm server instance type override" +} + +variable "vpc_id" { + description = "VPC ID to provision the EC2 instance" + type = string +} + +variable "aws_provisioner_key_name" { + description = "ssh key name to create the aws key pair and remote provision the EC2 instance" type = string } @@ -39,44 +49,20 @@ variable "region" { type = string } -variable "worker_region" { - default = "us-west-2" - description = "Optional AWS region where the workers will be created.
Defaults to us-west-2 (AWS)" - type = string -} - variable "ea_managed" { default = false description = "Whether or not install Elastic Agent managed APM Server" type = bool } +variable "apm_server_bin_path" { + default = "" + type = string + description = "Optionally use the apm-server binary found in the specified directory instead" +} + variable "tags" { type = map(string) default = {} description = "Optional set of tags to use for all deployments" } - -# CI variables -variable "BRANCH" { - description = "Branch name or pull request for tagging purposes" - default = "unknown" -} - -variable "BUILD_ID" { - description = "Build ID in the CI for tagging purposes" - default = "unknown" -} - -variable "CREATED_DATE" { - description = "Creation date in epoch time for tagging purposes" - default = "unknown" -} - -variable "ENVIRONMENT" { - default = "unknown" -} - -variable "REPO" { - default = "unknown" -}