Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add datasheet for 8cpu16gb #84

Merged
merged 26 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/load/gcp/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ terraform.tfstate*
.terraform*
node_modules/*
var_grafana_cloud.tfvars
venv/
22 changes: 13 additions & 9 deletions tests/load/gcp/cos-lite.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ data "cloudinit_config" "cos_lite" {
"content" : file("common/node-exporter.service"),
},
{
"path" : "/run/wait-for-prom-ready.sh",
"path" : "/var/wait-for-prom-ready.sh",
"permissions" : "0755",
"content" : templatefile("common/wait-for-prom-ready.tpl.sh", {
PROM_EXTERNAL_URL = local.prom_url,
}),
},
{
"path" : "/run/wait-for-grafana-ready.sh",
"path" : "/var/wait-for-grafana-ready.sh",
"permissions" : "0755",
"content" : templatefile("cos-lite/wait-for-grafana-ready.tpl.sh", {
GRAFANA_EXTERNAL_URL = local.grafana_url,
Expand All @@ -48,7 +48,7 @@ data "cloudinit_config" "cos_lite" {
}),
},
{
"path" : "/run/pod_top_exporter.py",
"path" : "/var/pod_top_exporter.py",
"permissions" : "0755",
"content" : templatefile("cos-lite/pod_top_exporter.tpl.py", {
JUJU_MODEL_NAME = var.juju_model_name,
Expand All @@ -59,7 +59,8 @@ data "cloudinit_config" "cos_lite" {
"content" : file("cos-lite/pod-top-exporter.service"),
},
{
"path" : "/run/overlay-load-test.yaml",
# Path must be inside $HOME because juju 3 is a strictly confined snap
"path" : "/home/ubuntu/overlay-load-test.yaml",
"content" : templatefile("cos-lite/overlay-load-test.tpl.yaml", {
COS_APPLIANCE_HOSTNAME = local.cos_appliance_hostname,
NUM_TARGETS = var.num_avalanche_targets,
Expand All @@ -68,7 +69,7 @@ data "cloudinit_config" "cos_lite" {
}),
},
{
"path" : "/run/cos-lite-rest-server.py",
"path" : "/var/cos-lite-rest-server.py",
"permissions" : "0755",
"content" : file("cos-lite/cos-lite-rest-server.py"),
},
Expand All @@ -79,6 +80,8 @@ data "cloudinit_config" "cos_lite" {
],

"package_update" : "true",
"package_upgrade": "true",
"package_reboot_if_required": "true",

"packages" : [
"python3-pip",
Expand All @@ -94,9 +97,11 @@ data "cloudinit_config" "cos_lite" {

"snap" : {
"commands" : [
"snap install --classic juju --channel=2.9/stable",
"snap install --classic microk8s --channel=1.26/stable",
"snap install --classic juju --channel=3.1/stable",
"snap install --classic microk8s --channel=1.27-strict/stable",
"snap alias microk8s.kubectl kubectl",
"snap alias microk8s.kubectl k",
"snap install yq",
"snap refresh",
]
}
Expand Down Expand Up @@ -126,7 +131,7 @@ resource "google_compute_instance" "vm_cos_lite_appliance" {
initialize_params {
image = "ubuntu-os-cloud/ubuntu-minimal-2204-lts"
type = var.disk_type
size = "100"
size = "200"
}
}

Expand All @@ -141,4 +146,3 @@ resource "google_compute_instance" "vm_cos_lite_appliance" {
}
}
}

4 changes: 2 additions & 2 deletions tests/load/gcp/cos-lite/cos-lite-rest-server.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
@functools.lru_cache
def get_admin_password():
action_result = subprocess.Popen(
["juju", "run-action", "grafana/0", "get-admin-password", "--wait"],
["juju", "run", "grafana/0", "get-admin-password", "--format=yaml"],
stdout=subprocess.PIPE,
user="ubuntu",
)
as_dict = yaml.safe_load(action_result.stdout.read().decode())
return as_dict["unit-grafana-0"]["results"]["admin-password"]
return as_dict["grafana/0"]["results"]["admin-password"]
2 changes: 1 addition & 1 deletion tests/load/gcp/cos-lite/cos-lite-rest-server.service
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ After=network.target multi-user.target
Type=simple
Restart=always
RestartSec=5
Environment="FLASK_APP=/run/cos-lite-rest-server.py"
Environment="FLASK_APP=/var/cos-lite-rest-server.py"
ExecStart=/usr/bin/flask run -p 8081 --host 0.0.0.0
9 changes: 8 additions & 1 deletion tests/load/gcp/cos-lite/overlay-load-test.tpl.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
--- # overlay.yaml
applications:
grafana:
# We need a 9.2 image until the load test is fixed to work with v10.
resources:
grafana-image: docker.io/ubuntu/grafana:9.2-22.04_beta
loki:
options:
ingestion-rate-mb: 60
ingestion-burst-size-mb: 100
resources:
loki-image: grafana/loki:2.4.1
loki-image: ubuntu/loki:2.9.2-22.04
prometheus:
storage:
database: 1000GiB
Expand Down
4 changes: 2 additions & 2 deletions tests/load/gcp/cos-lite/pod-top-exporter.service
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ After=network.target
Type=simple
Restart=always
RestartSec=5
ExecStartPre=/run/wait-for-prom-ready.sh
Environment=FLASK_APP=/run/pod_top_exporter.py
ExecStartPre=/var/wait-for-prom-ready.sh
Environment=FLASK_APP=/var/pod_top_exporter.py
ExecStart=flask run -p 29101 --host 0.0.0.0
74 changes: 68 additions & 6 deletions tests/load/gcp/cos-lite/pod_top_exporter.tpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@

# For memoization
prev_time = None
prev_data = {}
prev_top_pod_data = {}
prev_restart_count_data = {}


def get_top_pod() -> dict:
global prev_time, prev_data
global prev_time, prev_top_pod_data
now = datetime.now()
if prev_time and (now - prev_time).total_seconds() < 15:
return prev_data
if prev_top_pod_data and prev_time and (now - prev_time).total_seconds() < 15:
return prev_top_pod_data

# Going through `current` directly because microk8s-kubectl.wrapper creates subprocesses which
# expect a login session
Expand All @@ -40,11 +41,66 @@ def get_top_pod() -> dict:
# {'alertmanager-0': {'cpu': Decimal('0.056'), 'mem': Decimal('55574528.000')}, ...}

prev_time = now
prev_data = as_dict
prev_top_pod_data = as_dict
# print(as_dict)
return as_dict


def parse_restart_counts(output: str) -> Dict[str, Dict[str, int]]:
    """Parse kubectl jsonpath CSV-ish output into {pod: {container: restart_count}}.

    Each input line is a pod name followed by (container name, restart count)
    pairs, comma separated, e.g.:
        loki-0,charm,1,loki,1
        scrape-target-0,charm,1
    An empty string yields an empty dict. (Internal helper for
    get_restart_count; exposed at module level for testability.)
    """
    restart_counts: Dict[str, Dict[str, int]] = {}
    for line in output.splitlines():
        # Each line is made up of pod name and pairs of container name and restart count.
        # The length may change, depending on the number of containers in the pod.
        pod_name, *pairs = line.split(",")
        # Convert the flat [name, count, name, count, ...] list into a dict
        # (https://stackoverflow.com/a/12739974/3516684), with int counts.
        it = iter(pairs)
        restart_counts[pod_name] = {name: int(count) for name, count in zip(it, it)}
    return restart_counts


def get_restart_count() -> Dict[str, Dict[str, int]]:
    """Get the restart count per container per pod.

    Results are memoized for 15 seconds (sharing `prev_time` with
    get_top_pod, which is responsible for refreshing it). Returns an empty
    dict if kubectl exits with an error.
    """
    # BUGFIX: this previously declared/returned `prev_top_pod_data` (the
    # CPU/mem data) and never stored its own result, so memoization was
    # broken and the memo-hit branch returned the wrong dataset.
    global prev_time, prev_restart_count_data
    now = datetime.now()
    if prev_restart_count_data and prev_time and (now - prev_time).total_seconds() < 15:
        return prev_restart_count_data

    # Going through `current` directly because microk8s-kubectl.wrapper creates subprocesses which
    # expect a login session
    jsonpath_expr = r'{range .items[*]}{.metadata.name}{range .status.containerStatuses[*]}{","}{.name}{","}{.restartCount}{end}{"\n"}{end}'
    cmd = [
        "/snap/microk8s/current/kubectl",
        "--kubeconfig",
        "/var/snap/microk8s/current/credentials/client.config",
        "-n",
        "${JUJU_MODEL_NAME}",
        "get",
        "pod",
        f"-o=jsonpath={jsonpath_expr}",
    ]
    try:
        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(e.stdout.decode())
        return {}

    # Output looks like this:
    # modeloperator-86c5cfd684-7c2cb,juju-operator,0
    # traefik-0,charm,0,traefik,0
    # loki-0,charm,1,loki,1
    # scrape-target-0,charm,1
    output = result.stdout.decode("utf-8").strip()

    prev_restart_count_data = parse_restart_counts(output)
    return prev_restart_count_data


class GaugeFamily:
def __init__(self, name: str, help: str):
self.name = name
Expand Down Expand Up @@ -77,5 +133,11 @@ def metrics():
cpu.add({"name": name}, resources["cpu"])
mem.add({"name": name}, resources["mem"])

output = str(cpu) + str(mem)
# TODO the restart count should be a counter type, not a gauge.
pod_restart_count = GaugeFamily("pod_restart_count", "Pod restart count")
for pod_name, containers in get_restart_count().items():
for container_name, restart_count in containers.items():
pod_restart_count.add({"pod_name": pod_name, "container_name": container_name}, restart_count)

output = str(cpu) + str(mem) + str(pod_restart_count)
return output
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ After=network.target
Type=simple
Restart=always
RestartSec=5
ExecStartPre=/run/wait-for-prom-ready.sh
ExecStartPre=/var/wait-for-prom-ready.sh
# Going through `current` directly because microk8s-kubectl.wrapper creates subprocesses which expect a login session
ExecStart=/snap/microk8s/current/kubectl --kubeconfig /var/snap/microk8s/current/credentials/client.config logs prometheus-0 prometheus -n ${JUJU_MODEL_NAME} --follow
27 changes: 17 additions & 10 deletions tests/load/gcp/cos-lite/runcmd.tpl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ systemctl restart sysstat sysstat-collect.timer sysstat-summary.timer
systemctl start node-exporter.service

# setup microk8s and bootstrap
adduser ubuntu microk8s
usermod -a -G snap_microk8s ubuntu
microk8s status --wait-ready
microk8s enable dns:$(grep nameserver /run/systemd/resolve/resolv.conf | awk '{print $2}')
microk8s.enable hostpath-storage
Expand All @@ -42,14 +42,24 @@ microk8s.kubectl rollout status deployment.apps/metrics-server -n kube-system -w
# The connection to the server 127.0.0.1:16443 was refused - did you specify the right host or port?
# the metallb addon must be enabled only after the dns addon was rolled out
# https://github.com/ubuntu/microk8s/issues/2770#issuecomment-984346287
IPADDR=$(ip -4 -j route | jq -r '.[] | select(.dst | contains("default")) | .prefsrc')
IPADDR=$(ip -4 -j route get 2.2.2.2 | jq -r '.[] | .prefsrc')
microk8s.enable metallb:$IPADDR-$IPADDR
microk8s.kubectl rollout status daemonset.apps/speaker -n metallb-system -w --timeout=600s

# prep juju
sudo -u ubuntu juju bootstrap --no-gui --agent-version=2.9.34 microk8s uk8s
# For some reason, in the minimal cloud image, the group and owner of the ubuntu user is root
# TODO: try to add "owner" and "defer" to "write_files" to prevent from cloud init to set $HOME to
# be owned by root:root.
# https://canonical-cloud-init.readthedocs-hosted.com/en/latest/reference/modules.html#write-files
chgrp -R ubuntu /home/ubuntu
chown -R ubuntu /home/ubuntu

# https://bugs.launchpad.net/juju/+bug/1988355
sudo -u ubuntu mkdir -p /home/ubuntu/.local/share/juju
sudo -u ubuntu juju bootstrap --agent-version=3.1.6 microk8s uk8s
sudo -u ubuntu juju add-model --config logging-config="<root>=WARNING; unit=DEBUG" --config update-status-hook-interval="5m" ${JUJU_MODEL_NAME}
sudo -u ubuntu juju deploy --channel=edge cos-lite --trust --overlay /run/overlay-load-test.yaml --trust
sudo -u ubuntu juju deploy --channel=edge cos-lite --trust --overlay /home/ubuntu/overlay-load-test.yaml --trust



# start services
Expand All @@ -59,19 +69,16 @@ sudo -u ubuntu juju deploy --channel=edge cos-lite --trust --overlay /run/overla
# Error from server (NotFound): statefulsets.apps "grafana" not found
sleep 120
microk8s.kubectl rollout status statefulset.apps/grafana -n ${JUJU_MODEL_NAME} -w --timeout=600s
sudo -u ubuntu juju remove-relation grafana:ingress traefik
sleep 30
sudo -u ubuntu juju relate grafana:ingress traefik

# wait for grafana to become active
/run/wait-for-grafana-ready.sh
/var/wait-for-grafana-ready.sh
systemctl start cos-lite-rest-server.service

# force reldata reinit in case files appeared on disk after the last hook fired
sudo -u ubuntu juju run-action cos-config/0 sync-now --wait
sudo -u ubuntu juju run cos-config/0 sync-now

# Waiting for prom here because systemd would timeout waiting for the unit to become active/idle:
# Job for prometheus-stdout-logger.service failed because a timeout was exceeded.
/run/wait-for-prom-ready.sh
/var/wait-for-prom-ready.sh
systemctl start prometheus-stdout-logger.service
systemctl start pod-top-exporter.service
2 changes: 1 addition & 1 deletion tests/load/gcp/cos-lite/wait-for-grafana-ready.tpl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ set -eu

READY=0
until [ $READY -eq 1 ]; do
READY=$(curl -s --connect-timeout 2 --max-time 5 ${GRAFANA_EXTERNAL_URL}/-/ready | grep -F 'Found' | wc -l)
READY=$(curl -s --connect-timeout 2 --max-time 5 ${GRAFANA_EXTERNAL_URL}/api/health | grep -F 'version' | wc -l)
# READY=$(juju status grafana --format=json | jq -r '.applications.grafana."application-status".current' | grep -F 'active' | wc -l)
sleep 5
done
52 changes: 52 additions & 0 deletions tests/load/gcp/load-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# This cloud-init script can be used with multipass like so:
# multipass launch jammy --cloud-init ./load-test.yaml \
# --name load-test --memory 7G --cpus 2 --disk 50G \
# --mount ~/code:/home/ubuntu/code

package_update: true

packages:
- python3-pip
- jq
- zsh
- fzf
- tox
- kitty-terminfo

snap:
commands:
- snap install yq
- snap install --classic google-cloud-sdk
- snap install --classic terraform
- snap refresh

runcmd:
- DEBIAN_FRONTEND=noninteractive apt-get remove -y landscape-client landscape-common adwaita-icon-theme humanity-icon-theme
- DEBIAN_FRONTEND=noninteractive apt-get -y upgrade
- DEBIAN_FRONTEND=noninteractive apt-get -y autoremove

- |
# disable swap
sysctl -w vm.swappiness=0
echo "vm.swappiness = 0" | tee -a /etc/sysctl.conf
swapoff -a

- |
# disable unnecessary services
systemctl disable man-db.timer man-db.service --now
systemctl disable apport.service apport-autoreport.service --now
systemctl disable apt-daily.service apt-daily.timer --now
systemctl disable apt-daily-upgrade.service apt-daily-upgrade.timer --now
systemctl disable unattended-upgrades.service --now
systemctl disable motd-news.service motd-news.timer --now
systemctl disable bluetooth.target --now
systemctl disable ua-messaging.service ua-messaging.timer --now
systemctl disable ua-timer.timer ua-timer.service --now
systemctl disable systemd-tmpfiles-clean.timer --now

- |
# oh-my-zsh + juju plugin
sudo -u ubuntu sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended
sudo -u ubuntu git clone https://github.com/zsh-users/zsh-autosuggestions.git ~ubuntu/.oh-my-zsh/custom/plugins/zsh-autosuggestions
sudo -u ubuntu git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ~ubuntu/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting
sudo -u ubuntu sed -i 's/plugins=(git)/plugins=(fzf git zsh-autosuggestions zsh-syntax-highlighting terraform colored-man-pages colorize)/g' ~ubuntu/.zshrc
5 changes: 4 additions & 1 deletion tests/load/gcp/loki-log.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ data "cloudinit_config" "loki_log" {
],

"package_update" : "true",
"package_upgrade": "true",
"package_reboot_if_required": "true",

"packages" : [
"python3-pip",
Expand All @@ -61,7 +63,8 @@ resource "google_compute_instance" "vm_loki_log" {

name = "loki-log"

machine_type = "custom-4-4096"
machine_type = "custom-6-5632"
allow_stopping_for_update = true

tags = ["load-test-traffic", "vm-loki-log"]

Expand Down
2 changes: 1 addition & 1 deletion tests/load/gcp/loki-log/runcmd.tpl.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
set -eux

# install deps
python3 -m pip install locust logfmter Faker
python3 -m pip install locust logfmter Faker charset-normalizer

# wait until the cos-lite node is up
timeout 1800 bash -c "until curl -s --connect-timeout 2.0 --max-time 5 ${LOKI_URL}/ready; do sleep 5; done"
Expand Down
Loading
Loading