Skip to content

Commit

Permalink
prombench: Added support for --bench.version and `--bench.directory…
Browse files Browse the repository at this point in the history
…` flags.

See prometheus/proposals#41 for rationale.

Prometheus job got updated with prometheus/prometheus#15682

Signed-off-by: bwplotka <bwplotka@gmail.com>
  • Loading branch information
bwplotka committed Dec 17, 2024
1 parent c037ee0 commit 9a66b84
Show file tree
Hide file tree
Showing 11 changed files with 368 additions and 195 deletions.
71 changes: 55 additions & 16 deletions prombench/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
INFRA_CMD ?= ../infra/infra

PROVIDER ?= gke

.PHONY: deploy clean
deploy: node_create resource_apply
# GCP sometimes takes longer than 30 tries when trying to delete nodes
# if k8s resources are not already cleared
clean: resource_delete node_delete
INFRA_CMD ?= ../infra/infra
PROVIDER ?= gke

cluster_create:
${INFRA_CMD} ${PROVIDER} cluster create -a ${AUTH_FILE} \
Expand Down Expand Up @@ -37,50 +30,96 @@ cluster_delete:
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/cluster_${PROVIDER}.yaml

# /prombench <...> --bench.directory
# $(if ...) is used instead of ?= so that the default also applies when the
# variable is defined but empty (?= only assigns when the variable is undefined).
BENCHMARK_DIRECTORY := $(if $(BENCHMARK_DIRECTORY),$(BENCHMARK_DIRECTORY),manifests/prombench)
# /prombench <...> --bench.version
# Same empty-value fallback as above; `master` selects the standard configuration.
BENCHMARK_VERSION := $(if $(BENCHMARK_VERSION),$(BENCHMARK_VERSION),master)
# Repository cloned by maybe_pull_custom_version when BENCHMARK_VERSION is not master.
PROMBENCH_GIT_REPOSITORY ?= git@github.com:prometheus/test-infra.git
# Root directory that ${BENCHMARK_DIRECTORY} is resolved against; re-pointed at
# the temporary clone by maybe_pull_custom_version for custom versions.
PROMBENCH_DIR ?= .

# maybe_pull_custom_version allows custom benchmarking as designed in
# https://github.com/prometheus/proposals/pull/41. It allows calling
# /prombench <release> --bench.version=<@commit or branch>, which causes the
# prombench GH job on the Prometheus repo to call the infra CLI with a
# non-master BENCHMARK_VERSION. In that case we clone the prombench repository
# into a temporary directory, reset it to the requested branch or commit and
# point PROMBENCH_DIR at the clone. As a result `make deploy` and `make clean`
# use the custom manifests (or even node pools) from that version.
#
# The temporary directory is stored in maybe_pull_custom_version_TMP_DIR so
# that clean_tmp_dir can remove it. The $(eval ...) calls take effect while
# make expands this recipe, so later lines (and later targets) see the values.
.PHONY: maybe_pull_custom_version
maybe_pull_custom_version:
ifeq (${BENCHMARK_VERSION},master)
	@echo ">> Using standard benchmark configuration, from the docker image"
else
	@echo ">> Git pulling custom benchmark configuration from the ${BENCHMARK_VERSION}"
	@# The template needs trailing X's: GNU mktemp rejects a template without them.
	@$(eval $@_TMP_DIR=$(shell mktemp -d -t "prombench.XXXXXX"))
	@# Fail loudly instead of cloning into an unexpected directory if mktemp failed.
	@test -n "${$@_TMP_DIR}" || { echo ">> Failed to create a temporary directory" >&2; exit 1; }
	cd "${$@_TMP_DIR}" && git clone ${PROMBENCH_GIT_REPOSITORY}
ifeq ($(subst @,,${BENCHMARK_VERSION}),${BENCHMARK_VERSION})
	@echo ">> --bench.version is a branch, reseting to origin/${BENCHMARK_VERSION}"
	cd "${$@_TMP_DIR}/test-infra" && git reset --hard origin/${BENCHMARK_VERSION}
else
	@echo ">> --bench.version is a commit SHA, reseting to $(subst @,,${BENCHMARK_VERSION})"
	cd "${$@_TMP_DIR}/test-infra" && git reset --hard $(subst @,,${BENCHMARK_VERSION})
endif
	$(eval PROMBENCH_DIR=${$@_TMP_DIR}/test-infra/prombench)
endif
	@echo ">> Using following files in ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}"
	@ls -lR ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}

# clean_tmp_dir removes the temporary clone created by
# maybe_pull_custom_version. It does nothing when no custom version was pulled,
# because maybe_pull_custom_version_TMP_DIR is then empty.
.PHONY: clean_tmp_dir
clean_tmp_dir: # Clean after maybe_pull_custom_version
	[ -z "${maybe_pull_custom_version_TMP_DIR}" ] || rm -rf "${maybe_pull_custom_version_TMP_DIR}"

# deploy: resolves the benchmark configuration (maybe_pull_custom_version),
# creates the nodes, applies the benchmark resources and finally removes the
# temporary clone. node_create and resource_apply read PROMBENCH_DIR, which
# maybe_pull_custom_version may change, so the prerequisites must run in the
# listed order.
# NOTE(review): this ordering is not guaranteed under `make -j` - confirm the
# target is always run serially.
.PHONY: deploy
deploy: maybe_pull_custom_version node_create resource_apply clean_tmp_dir

# clean: resolves the benchmark configuration (maybe_pull_custom_version),
# deletes the benchmark resources and the nodes, then removes the temporary clone.
# GCP sometimes takes longer than 30 tries when trying to delete nodes
# if k8s resources are not already cleared
.PHONY: clean
clean: maybe_pull_custom_version resource_delete node_delete clean_tmp_dir

# node_create: runs the infra CLI `nodes create` command for ${PROVIDER},
# passing the zone, project, EKS, cluster and PR variables plus the
# provider-specific nodes manifest.
# NOTE(review): two `-f` lines are present (the fixed manifests/prombench path
# and the ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} path); this looks like the
# old and new lines of a diff shown together - confirm only the latter applies.
node_create:
${INFRA_CMD} ${PROVIDER} nodes create -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml

# resource_apply: runs the infra CLI `resource apply` command for ${PROVIDER}
# against the `benchmark` manifest directory, passing the PR, release, domain
# and GitHub org/repo variables used by the templates.
# NOTE(review): two `-f` lines are present (fixed path and
# ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} path); this looks like old and new
# diff lines shown together - confirm only the latter applies.
resource_apply:
$(INFRA_CMD) ${PROVIDER} resource apply -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v CLUSTER_NAME:${CLUSTER_NAME} \
-v PR_NUMBER:${PR_NUMBER} -v RELEASE:${RELEASE} -v DOMAIN_NAME:${DOMAIN_NAME} \
-v GITHUB_ORG:${GITHUB_ORG} -v GITHUB_REPO:${GITHUB_REPO} \
-f manifests/prombench/benchmark
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark

# resource_delete: runs the infra CLI `resource delete` command for
# ${PROVIDER} on the cluster-role-binding and namespace manifests only.
# Required because namespace and cluster-role are not part of the created nodes
# NOTE(review): both the fixed manifests/prombench paths and the
# ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} paths are listed; this looks like old
# and new diff lines shown together - confirm only the latter pair applies.
resource_delete:
$(INFRA_CMD) ${PROVIDER} resource delete -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/benchmark/1c_cluster-role-binding.yaml \
-f manifests/prombench/benchmark/1a_namespace.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark/1c_cluster-role-binding.yaml \
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark/1a_namespace.yaml

# node_delete: runs the infra CLI `nodes delete` command for ${PROVIDER} using
# the provider-specific nodes manifest.
# NOTE(review): two `-f` lines are present (fixed path and
# ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} path); this looks like old and new
# diff lines shown together - confirm only the latter applies.
node_delete:
$(INFRA_CMD) ${PROVIDER} nodes delete -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml

# all_nodes_running: runs the infra CLI `nodes check-running` command for
# ${PROVIDER} using the provider-specific nodes manifest; additionally passes
# the SEPARATOR variable.
# NOTE(review): two `-f` lines are present (fixed path and
# ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} path); this looks like old and new
# diff lines shown together - confirm only the latter applies.
all_nodes_running:
$(INFRA_CMD) ${PROVIDER} nodes check-running -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} -v SEPARATOR:${SEPARATOR} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml

# all_nodes_deleted: runs the infra CLI `nodes check-deleted` command for
# ${PROVIDER} using the provider-specific nodes manifest; additionally passes
# the SEPARATOR variable.
# NOTE(review): two `-f` lines are present (fixed path and
# ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} path); this looks like old and new
# diff lines shown together - confirm only the latter applies.
all_nodes_deleted:
$(INFRA_CMD) ${PROVIDER} nodes check-deleted -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} -v SEPARATOR:${SEPARATOR} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml
31 changes: 23 additions & 8 deletions prombench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,24 @@

This setup leverages **GitHub Actions** and **Google Kubernetes Engine (GKE)**, but is designed to be extendable to other Kubernetes providers.

## Overview of Manifest Files
## Configuration Files

The `/manifest` directory contains Kubernetes manifest files:
The `./manifests` directory contains the configuration files:

- **`cluster_gke.yaml`**: Creates the Main Node in GKE.
- **`cluster_eks.yaml`**: Creates the Main Node in EKS.
- **`cluster-infra/`**: Contains persistent components of the Main Node.
- **`prombench/`**: Resources created and destroyed for each Prombench test.
- **`./manifests/cluster_gke.yaml`**: Creates the Main Node in GKE.
- **`./manifests/cluster_eks.yaml`**: Creates the Main Node in EKS.
- **`./manifests/cluster-infra/`**: Contains persistent components of the Main Node.
- **`./manifests/prombench/`**: Resources created and destroyed for each Prombench test. See [its `README.md`](./manifests/prombench/README.md) for details.

## Setup and Running Prombench
## Prombench Setup

Prombench can be run on different providers. Follow these instructions based on your provider:

- [Google Kubernetes Engine (GKE)](docs/gke.md)
- [Kubernetes In Docker (KIND)](docs/kind.md)
- [Elastic Kubernetes Service (EKS)](docs/eks.md)

## Setting Up GitHub Actions
### Setting Up GitHub Actions

1. Place a workflow file in the `.github` directory of your repository. Refer to the [Prometheus GitHub repository](https://github.com/prometheus/prometheus) for an example.

Expand All @@ -30,22 +30,34 @@ Prombench can be run on different providers. Follow these instructions based on
```bash
cat $AUTH_FILE | base64 -w 0
```

3. Configure webhook to cluster's comment-monitor as described [here](../tools/comment-monitor/README.md#setting-up-the-github-webhook).

## Prombench Usage

### Triggering Tests via GitHub Comment

**Starting Tests:**

- `/prombench main` or `/prombench master` - Compare PR with the main/master branch.
- `/prombench v2.4.0` - Compare PR with a specific release version (e.g., from [quay.io/prometheus/prometheus:releaseVersion](https://quay.io/prometheus/prometheus:releaseVersion)).
- `/prombench v2.4.0 --bench.version=@aca1803ccf5d795eee4b0848707eab26d05965cc` - Compare with 2.4.0 release, but use a specific `aca1803ccf5d795eee4b0848707eab26d05965cc` commit on this repository for `./manifests/prombench` resources.
- `/prombench v2.4.0 --bench.version=mybranch` - Compare with 2.4.0 release, but use a specific `mybranch` on this repository for `./manifests/prombench` resources.
- `/prombench v2.4.0 --bench.directory=manifests/prombench-agent-mode` - Compare with 2.4.0 release, but use a specific resource directory on `master` branch for this repository. Currently there is only `./manifests/prombench` available (default), we might add more modes in the future.

**Restarting Tests:**

- `/prombench restart <release_version>`
- `/prombench restart <release_version> --bench.version=... --bench.directory=...`

**Stopping Tests:**

- `/prombench cancel`

**Printing available commands:**

- `/prombench help`

### Building the Docker Image

Build the Docker image with:
Expand All @@ -54,3 +66,6 @@ Build the Docker image with:
docker build -t prominfra/prombench:master .
```




Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,14 @@ data:
* To restart benchmark: `/prombench restart <branch or git tag to compare with>`
* To stop benchmark: `/prombench cancel`
* To print help: `/prombench help`
**Example:** `/prombench v3.0.0`
**Advanced Flags for `start` and `restart` Commands:**
* `--bench.directory` (default: `manifests/prombench`)
* `--bench.version` (default: `master`)
**Examples:**
* `/prombench v3.0.0`
* `/prombench v3.0.0 --bench.version=@aca1803ccf5d795eee4b0848707eab26d05965cc --bench.directory=manifests/prombench`
verify_user: true
commands:
Expand All @@ -24,7 +30,11 @@ data:
- name: restart
event_type: prombench_restart
args_regex: (?P<RELEASE>master|main|v[0-9]+\.[0-9]+\.[0-9]+\S*)$
arg_regex: (master|main|v[0-9]+\.[0-9]+\.[0-9]+\S*)
arg_name: RELEASE
flag_args:
bench.directory: BENCHMARK_DIRECTORY
bench.version: BENCHMARK_VERSION
comment_template: |
⏱️ Welcome (again) to Prometheus Benchmarking Tool. ⏱️
Expand All @@ -44,7 +54,11 @@ data:
- name: "" # start is a default (empty command).
event_type: prombench_start
args_regex: (?P<RELEASE>master|main|v[0-9]+\.[0-9]+\.[0-9]+\S*)$
arg_regex: (master|main|v[0-9]+\.[0-9]+\.[0-9]+\S*)
arg_name: RELEASE
flag_args:
bench.directory: BENCHMARK_DIRECTORY
bench.version: BENCHMARK_VERSION
label: prombench
comment_template: |
⏱️ Welcome to Prometheus Benchmarking Tool. ⏱️
Expand Down
45 changes: 45 additions & 0 deletions prombench/manifests/prombench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## Prombench Benchmark Scenario Configuration

This directory contains resources that are applied (and cleaned) on every benchmark request
via `infra` CLI using [`make deploy`](../../Makefile) and cleaned using [`make clean`](../../Makefile).

It assumes a running cluster that was created via the `infra` CLI using `make cluster_create` (and that is deleted using `make cluster_delete`).

### Variables

It expects the following templated variables:

* `.PR_NUMBER`: The PR number from which `/prombench` was triggered. This PR number also tells what commit to use for the `prometheus-test-pr-{{ .PR_NUMBER }}` Prometheus image building (in the init container).
* `.RELEASE`: The argument provided by `/prombench` caller representing the Prometheus version (docker image tag for `quay.io/prometheus/prometheus:{{ .RELEASE }}`) to compare with, deployed as the `prometheus-test-{{ .RELEASE }}`.
* `.DOMAIN_NAME`
* `.LOADGEN_SCALE_UP_REPLICAS`
* `.GITHUB_ORG`
* `.GITHUB_REPO`

### Customizations

> NOTE: See https://github.com/prometheus/proposals/pull/41 for design.
On the `master` branch, in this directory, we maintain the standard, single benchmarking scenario used
as an acceptance validation for Prometheus. It's important to ensure it represents common Prometheus configuration.

The only user related parameter for the standard scenario is `RELEASE` version.

However, it's possible to create fully custom benchmarking scenarios for `/prombench` via the `--bench.version=<branch|@commit>` flag.

Here are example steps:

1. Create a new branch on https://github.com/prometheus/test-infra e.g. `benchmark/scenario1`.
2. Modify this directory to your liking, e.g. by changing the query load, the metric load or the advanced Prometheus configuration. It's also possible to make the Prometheus deployments and versions exactly the same, but vary a single configuration flag, for feature benchmarking.

> WARN: When customizing this directory, don't change `1a_namespace.yaml` or `1c_cluster-role-binding.yaml` filenames as they are used for cleanup routine. Or, if you change it, know what you're doing in relation to [`make clean` job](../../Makefile).
3. Push changes to the new branch.
4. From the Prometheus PR comment, call prombench as `/prombench <release> --bench.version=benchmark/scenario1` or `/prombench <release> --bench.version=@<relevant commit SHA from the benchmark/scenario1>` to use configuration files from this custom branch.

Other details:

* Other custom branch modifications other than to this directory do not affect prombench (e.g. to infra CLI or makefiles).
* `--bench.version` is designed for short-term or even one-off benchmark scenario configurations. It's not designed for long-term, well-maintained scenarios. For the latter, we can maintain multiple `manifests/prombench`-like directories and select one with the `--bench.directory` flag.
* Non-maintainers can follow a similar process, but they will need to ask a maintainer for a new branch and a PR review. We can consider extending `--bench.version` to support remote repositories if this becomes a problem.
* Custom benchmarking logic is implemented in the [`maybe_pull_custom_version` make job](../../Makefile) and invoked by the prombench GH job on Prometheus repo on `deploy` and `clean`.
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ spec:
name: prometheus-test
- name: instance-ssd
hostPath:
path: /mnt/disks/ssd0 #gke ssds
# /mnt is where GKE keeps its SSD.
path: /mnt/disks/ssd0
- name: prometheus-executable
emptyDir: {}
terminationGracePeriodSeconds: 300
Expand All @@ -113,91 +114,3 @@ spec:
selector:
app: prometheus
prometheus: test-pr-{{ .PR_NUMBER }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus-test-{{ normalise .RELEASE }}
namespace: prombench-{{ .PR_NUMBER }}
labels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
template:
metadata:
namespace: prombench-{{ .PR_NUMBER }}
labels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
spec:
serviceAccountName: prometheus
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- topologyKey: kubernetes.io/hostname
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- prometheus
securityContext:
runAsUser: 0
containers:
- name: prometheus
image: quay.io/prometheus/prometheus:{{ .RELEASE }}
imagePullPolicy: Always
command: [ "/bin/prometheus" ]
args: [
"--web.external-url=http://{{ .DOMAIN_NAME }}/{{ .PR_NUMBER }}/prometheus-release",
"--storage.tsdb.path=/prometheus",
"--config.file=/etc/prometheus/prometheus.yml",
"--log.level=debug"
]
resources:
requests:
cpu: 2
memory: 20Gi
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus
- name: instance-ssd
mountPath: /prometheus
ports:
- name: prom-web
containerPort: 9090
volumes:
- name: config-volume
configMap:
name: prometheus-test
- name: instance-ssd
hostPath:
# /mnt is where GKE keeps it's SSD
# don't change this if you want Prometheus to take advantage of these local SSDs
path: /mnt/disks/ssd0
terminationGracePeriodSeconds: 300
nodeSelector:
node-name: prometheus-{{ .PR_NUMBER }}
isolation: prometheus
---
apiVersion: v1
kind: Service
metadata:
name: prometheus-test-{{ normalise .RELEASE }}
namespace: prombench-{{ .PR_NUMBER }}
labels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
spec:
ports:
- name: prom-web
port: 80
targetPort: prom-web
selector:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
Loading

0 comments on commit 9a66b84

Please sign in to comment.