Skip to content

Commit

Permalink
performance: add a new benchmarks workflow to enable PGO builds (#13884)
Browse files Browse the repository at this point in the history
Add a benchmark workflow mode with automation to collect, preserve, and inject CPU profiles, enabling PGO builds.

The new workflow will run on a schedule and raise a special pull request that includes the most recent representative CPU profile, which will be inserted as the `default.pgo` file into the main package and automatically used in the build pipeline. The actual schedule and the model for raising pull requests with updated profiles are subject to further revisions. This new workflow mode uses a lightweight output destination - a mock proxy (Moxy) from apm-perf to better isolate the performance component of the APM Server.
  • Loading branch information
1pkg authored Oct 3, 2024
1 parent 1bc9e5e commit 5af8cf4
Show file tree
Hide file tree
Showing 36 changed files with 731 additions and 195 deletions.
15 changes: 15 additions & 0 deletions .ci/scripts/push-pgo-pr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Publishes a refreshed PGO profile as a pull request.
#
# Creates a timestamped branch from main, moves the collected CPU profile to
# x-pack/apm-server/default.pgo, commits and pushes it, opens a pull request
# against elastic/apm-server and enables squash auto-merge for that PR.
#
# Required environment variables:
#   WORKSPACE_PATH - root of the git checkout to operate in.
#   PROFILE_PATH   - path of the CPU profile to install; a relative path is
#                    resolved after changing into WORKSPACE_PATH.
#   WORKFLOW       - URL of the benchmarks workflow run, embedded in the
#                    commit message and in the pull request body.

# -u makes a missing variable a hard error instead of an empty expansion.
set -euo pipefail

# Fail early with a clear message: an empty WORKSPACE_PATH would otherwise
# make `cd` silently switch to $HOME, and an empty WORKFLOW would produce a
# pull request with a blank link.
: "${WORKSPACE_PATH:?WORKSPACE_PATH must be set}"
: "${PROFILE_PATH:?PROFILE_PATH must be set}"
: "${WORKFLOW:?WORKFLOW must be set}"

# The epoch suffix keeps branch names unique across runs.
PGO_BRANCH="update-pgo-$(date +%s)"

# All expansions are quoted so paths containing spaces or glob characters
# are passed through unchanged.
cd "$WORKSPACE_PATH"
git fetch origin main
git checkout main
git checkout -b "$PGO_BRANCH"
mv "$PROFILE_PATH" x-pack/apm-server/default.pgo
git add x-pack/apm-server/default.pgo
git commit -m "PGO: Update default.pgo from benchmarks $WORKFLOW."
git push -u origin "$PGO_BRANCH"
gh pr create -B main -H "$PGO_BRANCH" -t "PGO: Update default.pgo" -b "Update default.pgo CPU profile from the benchmarks [workflow]($WORKFLOW)." -R elastic/apm-server
gh pr merge --auto --delete-branch --squash "$PGO_BRANCH"
94 changes: 86 additions & 8 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ name: benchmarks
on:
workflow_dispatch:
inputs:
runStandalone:
description: 'Run the benchmarks against standalone APM Server with Moxy'
required: false
type: boolean
default: false
profile:
description: 'The system profile used to run the benchmarks'
required: false
Expand All @@ -21,10 +26,12 @@ on:
required: false
type: string
schedule:
- cron: '0 17 * * *'
- cron: '0 17 * * *' # Scheduled regular benchmarks.
- cron: '0 5 */5 * *' # Scheduled PGO benchmarks.

env:
PNG_REPORT_FILE: out.png
BENCHMARK_CPU_OUT: default.pgo
BENCHMARK_RESULT: benchmark-result.txt
WORKING_DIRECTORY: testing/benchmark

Expand All @@ -38,12 +45,13 @@ jobs:
run:
working-directory: ${{ env.WORKING_DIRECTORY }}
permissions:
contents: read
contents: write
id-token: write
env:
SSH_KEY: ./id_rsa_terraform
TF_VAR_private_key: ./id_rsa_terraform
TF_VAR_public_key: ./id_rsa_terraform.pub
RUN_STANDALONE: ${{ inputs.runStandalone || github.event.schedule=='0 5 */5 * *' }}
TFVARS_SOURCE: ${{ inputs.profile || 'system-profiles/8GBx1zone.tfvars' }} # // Default to use an 8gb profile
TF_VAR_BUILD_ID: ${{ github.run_id }}
TF_VAR_ENVIRONMENT: ci
Expand Down Expand Up @@ -101,28 +109,48 @@ jobs:
terraform_version: 1.3.7
terraform_wrapper: false

- name: Init terraform module
id: init
run: make init

- name: Build apmbench
run: make apmbench $SSH_KEY terraform.tfvars

- name: Build APM Server and Moxy
if: ${{ env.RUN_STANDALONE == 'true' }}
run: |
make apm-server
make moxy
- name: Override docker committed version
if: ${{ ! inputs.runOnStable }}
if: ${{ ! inputs.runOnStable && env.RUN_STANDALONE == 'false' }}
run: make docker-override-committed-version

- name: Spin up benchmark environment
id: deploy
run: |
make init apply
make apply
admin_console_url=$(terraform output -raw admin_console_url)
echo "admin_console_url=$admin_console_url" >> "$GITHUB_OUTPUT"
echo "-> infra setup done"
env:
TF_VAR_worker_region: ${{ env.AWS_REGION }}
TF_VAR_run_standalone: ${{ env.RUN_STANDALONE }}

- name: Run benchmarks autotuned
if: ${{ inputs.benchmarkAgents == '' }}
run: make run-benchmark-autotuned index-benchmark-results
run: make run-benchmark-autotuned

- name: Run benchmarks self tuned
if: ${{ inputs.benchmarkAgents != '' }}
run: make run-benchmark index-benchmark-results
run: make run-benchmark

- name: Cat standalone server logs
if: ${{ env.RUN_STANDALONE == 'true' && failure() }}
run: make cat-apm-server-logs

- name: Index benchmarks result
run: make index-benchmark-results

- name: Download PNG
run: >-
Expand Down Expand Up @@ -150,15 +178,65 @@ jobs:
- name: Upload benchmark result
uses: actions/upload-artifact@v4
if: always()
with:
name: benchmark-result
path: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_RESULT }}
if-no-files-found: error

# The next section injects the CPU profile collected by apmbench into the build
# by copying the profile, uploading it as an artifact, and pushing it
# via a PR that updates default.pgo.

- name: Copy CPU profile
run: make cp-cpuprof

- name: Upload CPU profile
uses: actions/upload-artifact@v4
with:
name: cpu-profile
path: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_CPU_OUT }}
if-no-files-found: error

- name: Get token
id: get_token
uses: tibdex/github-app-token@3beb63f4bd073e61482598c45c71c1019b59b73a # v2.1.0
with:
app_id: ${{ secrets.OBS_AUTOMATION_APP_ID }}
private_key: ${{ secrets.OBS_AUTOMATION_APP_PEM }}
permissions: >-
{
"contents": "write",
"pull_requests": "write"
}
# Required to use a service account, otherwise PRs created by
# GitHub bot won't trigger any CI builds.
# See https://github.com/peter-evans/create-pull-request/issues/48#issuecomment-537478081
- name: Configure git user
uses: elastic/oblt-actions/git/setup@v1
with:
github-token: ${{ steps.get_token.outputs.token }}

- name: Import GPG key
uses: crazy-max/ghaction-import-gpg@01dd5d3ca463c7f10f7f4f7b4f177225ac661ee4 # v6.1.0
with:
gpg_private_key: ${{ secrets.APM_SERVER_RELEASE_GPG_PRIVATE_KEY }}
passphrase: ${{ secrets.APM_SERVER_RELEASE_PASSPHRASE }}
git_user_signingkey: true
git_commit_gpgsign: true

- name: Open PGO PR
if: ${{ env.RUN_STANDALONE == 'true' && github.ref == 'refs/heads/main' }}
run: ${{ github.workspace }}/.ci/scripts/push-pgo-pr.sh
env:
WORKSPACE_PATH: ${{ github.workspace }}
PROFILE_PATH: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_CPU_OUT }}
GITHUB_TOKEN: ${{ steps.get_token.outputs.token }}
WORKFLOW: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}

- name: Tear down benchmark environment
if: always()
run: make destroy
run: make init destroy

# Notify failure to Slack only on schedule (nightly run)
- if: failure() && github.event_name == 'schedule'
Expand Down
2 changes: 1 addition & 1 deletion systemtest/benchtest/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func (p *profiles) recordCPU() error {
if benchConfig.CPUProfile == "" {
return nil
}
duration := 2 * benchConfig.Benchtime
duration := benchConfig.Benchtime
profile, err := fetchProfile("/debug/pprof/profile", duration)
if err != nil {
return fmt.Errorf("failed to fetch CPU profile: %w", err)
Expand Down
31 changes: 30 additions & 1 deletion testing/benchmark/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@ APMBENCH_PATH ?= ../../systemtest/cmd/apmbench
APMBENCH_GOOS ?= linux
APMBENCH_GOARCH ?= amd64

MOXY_GOOS ?= linux
MOXY_GOARCH ?= amd64

APM_SERVER_GOOS ?= linux
APM_SERVER_GOARCH ?= amd64

TFVARS_SOURCE ?= terraform.tfvars.example

BENCHMARK_WARMUP_TIME ?= 5m
Expand All @@ -23,6 +29,8 @@ SSH_USER ?= ec2-user
SSH_OPTS ?= -o LogLevel=ERROR -o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=10
SSH_KEY ?= ~/.ssh/id_rsa_terraform
WORKER_IP = $(shell terraform output -raw public_ip)
APM_SERVER_IP = $(shell terraform output -raw apm_server_ip)
RUN_STANDALONE = $(shell echo var.run_standalone | terraform console | tr -d '"')

SHELL = /bin/bash
.SHELLFLAGS = -o pipefail -c
Expand Down Expand Up @@ -67,6 +75,15 @@ apmbench:
@echo "-> Building apmbench..."
@cd $(APMBENCH_PATH) && CGO_ENABLED=0 GOOS=$(APMBENCH_GOOS) GOARCH=$(APMBENCH_GOARCH) go build .

# Build the moxy binary (github.com/elastic/apm-perf/cmd/moxy) with cgo
# disabled for MOXY_GOOS/MOXY_GOARCH. `go build` runs from ../../tools and
# writes its output to ../build relative to that directory.
.PHONY: moxy
moxy:
@echo "-> Building moxy..."
@cd ../../tools && CGO_ENABLED=0 GOOS=$(MOXY_GOOS) GOARCH=$(MOXY_GOARCH) go build -o "../build" github.com/elastic/apm-perf/cmd/moxy

# Build apm-server for APM_SERVER_GOOS/APM_SERVER_GOARCH by invoking the
# Makefile two directories up, then rename the platform-suffixed binary to
# build/apm-server.
.PHONY: apm-server
apm-server:
@cd ../.. && make build/apm-server-$(APM_SERVER_GOOS)-$(APM_SERVER_GOARCH) && mv build/apm-server-$(APM_SERVER_GOOS)-$(APM_SERVER_GOARCH) build/apm-server

.PHONY: init
init:
@terraform init
Expand Down Expand Up @@ -110,13 +127,25 @@ index-benchmark-results: _default-gobench-vars

.PHONY: _default-gobench-vars
_default-gobench-vars:
ifeq ($(RUN_STANDALONE),true)
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_size=$(shell echo var.standalone_apm_server_instance_size | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),moxy_size=$(shell echo var.standalone_moxy_instance_size | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),build_sha=$(shell git rev-parse HEAD))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),bench_mode=standalone)
else
# TODO(marclop) Update code below to use a for loop, rather than copying the lines.
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_size=$(shell echo var.apm_server_size | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),elasticsearch_size=$(shell echo var.elasticsearch_size | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),stack_version=$(shell echo var.stack_version | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_zone_count=$(shell echo var.apm_server_zone_count | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),elasticsearch_zone_count=$(shell echo var.elasticsearch_zone_count | terraform console | tr -d '"'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),build_sha=$(shell curl -sL -H "Authorization: Bearer $(shell terraform output -raw apm_secret_token )" $(shell terraform output -raw apm_server_url ) | jq -r '.build_sha'))
$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),bench_mode=cloud)
endif

# Print every file under /var/log/apm-server/ on the host at APM_SERVER_IP,
# connecting over SSH as SSH_USER with SSH_KEY and SSH_OPTS.
.PHONY: cat-apm-server-logs
cat-apm-server-logs:
@ssh $(SSH_OPTS) -i $(SSH_KEY) $(SSH_USER)@$(APM_SERVER_IP) "cat /var/log/apm-server/*"

$(SSH_KEY):
@ssh-keygen -t rsa -b 4096 -C "$(USER)@elastic.co" -N "" -f $(SSH_KEY)
Expand Down Expand Up @@ -172,4 +201,4 @@ elastic_agent_docker_image: build_elastic_agent_docker_image
build_elastic_agent_docker_image:
@env BASE_IMAGE=${ELASTIC_AGENT_DOCKER_IMAGE}:${ELASTIC_AGENT_IMAGE_TAG} GOARCH=amd64 \
bash ${REPO_ROOT}/testing/docker/elastic-agent/build.sh \
-t ${CI_ELASTIC_AGENT_DOCKER_IMAGE}:${CUSTOM_IMAGE_TAG}
-t ${CI_ELASTIC_AGENT_DOCKER_IMAGE}:${CUSTOM_IMAGE_TAG}
2 changes: 1 addition & 1 deletion testing/benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ overridden automatically, you need to remove it manually if present.
#### Override docker image tag

It is possible to override the tag of the docker image that is run in the remote ESS deployment. You can
specify any of the avilable tags (such as `8.3.0-SNAPSHOT` or a more specific tag `8.3.0-c655cda8-SNAPSHOT`).
specify any of the available tags (such as `8.3.0-SNAPSHOT` or a more specific tag `8.3.0-c655cda8-SNAPSHOT`).
Alternatively, you can run `make docker-override-committed-version` in your shell to use the committed
tags in the `docker-compose.yml` file in the repository root.

Expand Down
84 changes: 80 additions & 4 deletions testing/benchmark/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,46 @@ locals {
name_prefix = "${coalesce(var.user_name, "unknown-user")}-bench"
}

# VPC for the benchmark infrastructure. Its id (module.vpc.vpc_id) is passed
# to the benchmark_worker, moxy and standalone_apm_server modules.
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "3.14.0"

name = "${var.user_name}-worker"
cidr = var.vpc_cidr

# Single availability zone: zone "a" of the worker region.
azs = ["${var.worker_region}a"]
public_subnets = var.public_cidr
enable_ipv6 = false
# Only public subnets are declared, and no NAT gateway is created.
enable_nat_gateway = false
single_nat_gateway = false

# Take over the VPC's default security group and define its rules here.
manage_default_security_group = true
# Ingress: all ports and protocols (protocol -1), from members of the
# group itself and from any IPv4 address.
# NOTE(review): 0.0.0.0/0 on all ports is fully open; confirm this is
# intended for these benchmark hosts.
default_security_group_ingress = [
{
"from_port" : 0,
"to_port" : 0,
"protocol" : -1,
"self" : true,
"cidr_blocks" : "0.0.0.0/0",
}
]
# Egress: all ports and protocols to any IPv4 address.
default_security_group_egress = [
{
"from_port" : 0,
"to_port" : 0,
"protocol" : -1,
"cidr_blocks" : "0.0.0.0/0",
}
]

tags = merge(local.ci_tags, module.tags.tags)
vpc_tags = {
Name = "vpc-${var.user_name}-worker"
}
}

module "ec_deployment" {
count = var.run_standalone ? 0 : 1
source = "../infra/terraform/modules/ec_deployment"

region = var.ess_region
Expand Down Expand Up @@ -73,18 +112,55 @@ module "ec_deployment" {

module "benchmark_worker" {
source = "../infra/terraform/modules/benchmark_executor"
region = var.worker_region

vpc_id = module.vpc.vpc_id
region = var.worker_region
user_name = var.user_name

apm_server_url = module.ec_deployment.apm_url
apm_secret_token = module.ec_deployment.apm_secret_token
apm_server_url = var.run_standalone ? module.standalone_apm_server[0].apm_server_url : module.ec_deployment[0].apm_url
apm_secret_token = var.run_standalone ? module.standalone_apm_server[0].apm_secret_token : module.ec_deployment[0].apm_secret_token

apmbench_bin_path = var.apmbench_bin_path
instance_type = var.worker_instance_type

public_key = var.public_key
private_key = var.private_key

tags = merge(local.ci_tags, module.tags.tags)
tags = merge(local.ci_tags, module.tags.tags)
depends_on = [module.standalone_apm_server, module.ec_deployment]
}

# Moxy instance, created only when var.run_standalone is true (count is 0
# otherwise). Its moxy_url and moxy_password outputs are consumed by the
# standalone_apm_server module as its Elasticsearch endpoint and password.
module "moxy" {
count = var.run_standalone ? 1 : 0
source = "../infra/terraform/modules/moxy"

vpc_id = module.vpc.vpc_id
instance_type = var.standalone_moxy_instance_size
moxy_bin_path = var.moxy_bin_path

# NOTE(review): the provisioner key name is fed from var.private_key;
# confirm the module expects that value here.
aws_provisioner_key_name = var.private_key

tags = merge(local.ci_tags, module.tags.tags)
depends_on = [module.vpc]
}


# Standalone APM Server instance, created only when var.run_standalone is
# true (count is 0 otherwise). It runs the binary at var.apm_server_bin_path
# and is pointed at the moxy module's URL and password in place of an
# Elasticsearch cluster.
module "standalone_apm_server" {
count = var.run_standalone ? 1 : 0
source = "../infra/terraform/modules/standalone_apm_server"

vpc_id = module.vpc.vpc_id
aws_os = "amzn2-ami-hvm-*-x86_64-ebs"
apm_instance_type = var.standalone_apm_server_instance_size
apm_server_bin_path = var.apm_server_bin_path
ea_managed = false

# NOTE(review): the provisioner key name is fed from var.private_key;
# confirm the module expects that value here.
aws_provisioner_key_name = var.private_key

# Index [0] is valid because moxy uses the same run_standalone count
# condition as this module.
elasticsearch_url = module.moxy[0].moxy_url
elasticsearch_username = "elastic"
elasticsearch_password = module.moxy[0].moxy_password

tags = merge(local.ci_tags, module.tags.tags)
depends_on = [module.moxy]
}
Loading

0 comments on commit 5af8cf4

Please sign in to comment.