Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ebpf collector #156

Merged
merged 18 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@ jobs:
steps:
- prometheus/setup_environment
- run: go mod download
- run: GOARCH=1 make clang
- run: make
- run: CGO_BUILD=1 make
test-arm:
executor: arm
steps:
- checkout
- run: uname -a
- run: GOARCH=1 make clang
- run: make
- run: CGO_BUILD=1 make
build:
Expand Down
51 changes: 51 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveBitFields: false
AlignConsecutiveDeclarations: false
AlignConsecutiveMacros: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortEnumsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
BasedOnStyle: LLVM
BraceWrapping:
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterStruct: false
AfterUnion: false
BeforeElse: false
IndentBraces: false
BreakBeforeBraces: Custom
ColumnLimit: 0
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 8
Cpp11BracedListStyle: false
DerivePointerAlignment: false
IndentCaseLabels: false
IndentPPDirectives: None
IndentWidth: 8
IndentWrappedFunctionNames: false
PointerAlignment: Right
ReflowComments: false
SortIncludes: false
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
TabWidth: 8
UseTab: Always
2 changes: 2 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ jobs:
shell: bash
if: ${{ matrix.language == 'go' }}
run: |
echo 'Installing clang 18'
GOARCH=1 make clang
echo 'Building pure go binaries'
make build
echo 'Building cgo binaries'
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/step_tests-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ jobs:
with:
go-version: 1.22.x

- name: Setup clang 18
run: ./scripts/install_clang.sh

- name: Run e2e tests for Go packages
run: make test-e2e

Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/step_tests-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ jobs:
with:
go-version: 1.22.x

- name: Create a sample object file
run: |
mkdir -p pkg/collector/bpf/objs
touch pkg/collector/bpf/objs/sample.o

- name: Lint
uses: golangci/golangci-lint-action@v6
with:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/step_tests-unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ jobs:
with:
go-version: 1.22.x

- name: Setup clang 18
run: ./scripts/install_clang.sh

- name: Run checkmetrics and checkrules
run: make checkmetrics checkrules

Expand Down
14 changes: 11 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ else
test-docker := test-docker
endif

# Base test flags
test-flags := -covermode=atomic -race

# Use CGO for api and GO for ceems_exporter.
PROMU_TEST_CONF ?= .promu-go-test.yml
ifeq ($(CGO_BUILD), 1)
Expand Down Expand Up @@ -67,8 +70,13 @@ else

# go test flags
coverage-file := coverage-go.out

# If running in CI add -exec sudo flags to run tests that require privileges
ifeq ($(CI), true)
test-flags := $(test-flags) -exec sudo
endif
endif
test-flags := -covermode=atomic -coverprofile=$(coverage-file).tmp -race
test-flags := $(test-flags) -coverprofile=$(coverage-file).tmp

ifeq ($(GOHOSTOS), linux)
test-e2e := test-e2e
Expand Down Expand Up @@ -109,13 +117,13 @@ coverage:
$(GO) tool cover -func=coverage.out -o=coverage.out

.PHONY: test
test: pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked
test: pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked bpf
@echo ">> running tests"
$(GO) test -short $(test-flags) $(pkgs)
cat $(coverage-file).tmp | grep -v "main.go" > $(coverage-file)

.PHONY: test-32bit
test-32bit: pkg/collector/testdata/sys/.unpacked
test-32bit: pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked bpf
@echo ">> running tests in 32-bit mode"
@env GOARCH=$(GOARCH_CROSS) $(GO) test $(pkgs)

Expand Down
38 changes: 36 additions & 2 deletions Makefile.common
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ PROMU := $(FIRST_GOPATH)/bin/promu
SWAG := $(FIRST_GOPATH)/bin/swag
pkgs = ./...

# clang format
# find(1) expression selecting the BPF C sources and headers to format.
# The -name tests are grouped in parentheses so that the -not -path exclusions
# apply to both *.c and *.h files: -o binds looser than the implicit -a, so
# without the grouping the exclusions would only filter the *.h branch.
FORMAT_FIND_FLAGS ?= \( -name '*.c' -o -name '*.h' \) -not -path 'pkg/collector/bpf/include/vmlinux.h' -not -path 'pkg/collector/bpf/libbpf/*'

ifeq (arm, $(GOHOSTARCH))
GOHOSTARM ?= $(shell GOARM= $(GO) env GOARM)
GO_BUILD_PLATFORM ?= $(GOHOSTOS)-$(GOHOSTARCH)v$(GOHOSTARM)
Expand All @@ -48,7 +51,7 @@ PROMU_URL := https://github.com/prometheus/promu/releases/download/v$(PROMU_
SKIP_GOLANGCI_LINT :=
GOLANGCI_LINT :=
GOLANGCI_LINT_OPTS ?=
GOLANGCI_LINT_VERSION ?= v1.54.2
GOLANGCI_LINT_VERSION ?= v1.60.3
# golangci-lint only supports linux, darwin and windows platforms on i386/amd64.
# windows isn't included here because of the path separator being different.
ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux darwin))
Expand Down Expand Up @@ -172,6 +175,10 @@ else
yamllint .
endif

# Format the BPF C sources in place. -style=file makes clang-format read its
# rules from a .clang-format file rather than a built-in style.
# The phony declaration must name the target that is actually defined
# (clang-format); otherwise a file or directory called "clang-format" would
# make this rule appear up to date and it would silently never run.
.PHONY: clang-format
clang-format: ## Run code formatter on BPF code.
	find pkg/collector/bpf $(FORMAT_FIND_FLAGS) | xargs -n 1000 clang-format -i -style=file

# For backward-compatibility.
.PHONY: common-staticcheck
common-staticcheck: lint
Expand All @@ -184,7 +191,7 @@ common-unused:

# Don't bother updating swagger docs for release builds
.PHONY: common-build
common-build: promu swag
common-build: promu swag bpf
ifeq ($(RELEASE_BUILD), 0)
ifeq ($(CGO_BUILD), 1)
@echo ">> updating swagger docs"
Expand Down Expand Up @@ -246,6 +253,33 @@ $(PROMU):
cp $(PROMU_TMP)/promu-$(PROMU_VERSION).$(GO_BUILD_PLATFORM)/promu $(FIRST_GOPATH)/bin/promu
rm -r $(PROMU_TMP)

# Build bpf assets
.PHONY: bpf
# Build bpf assets only when CGO_BUILD=0
ifeq ($(CGO_BUILD), 0)
bpf: clang bpfclean
@echo ">> building bpf assets using clang"
$(MAKE) -C ./pkg/collector/bpf

# Clean existing bpf assets. When GOARCH is set we ALWAYS clean the
# assets as we need to build them for each architecture
.PHONY: bpfclean
ifdef GOARCH
bpfclean:
@echo ">> cleaning existing bpf assets"
$(MAKE) -C ./pkg/collector/bpf clean
endif

# Install clang using script. Do it only when GOARCH is set as we need
# clang to build go binaries inside golang-builder container.
.PHONY: clang
ifdef GOARCH
clang:
@echo ">> installing clang"
@./scripts/install_clang.sh
endif
endif

# Don't run swagger for release builds. Release builds cross compile with GOARCH set
# to different archs, so swag would be built in an arch specific bin folder.
.PHONY: swag
Expand Down
29 changes: 17 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

| | |
| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| CI/CD | [![ci](https://github.com/mahendrapaipuri/ceems/workflows/CI/badge.svg)](https://github.com/mahendrapaipuri/ceems) [![CircleCI](https://dl.circleci.com/status-badge/img/circleci/8jSYT1wyKY8mKQRTqNLThX/TzM1Mr3AEAqmehnoCde19R/tree/main.svg?style=svg&circle-token=28db7268f3492790127da28e62e76b0991d59c8b)](https://dl.circleci.com/status-badge/redirect/circleci/8jSYT1wyKY8mKQRTqNLThX/TzM1Mr3AEAqmehnoCde19R/tree/main) [![Coverage](https://img.shields.io/badge/Coverage-75.9%25-brightgreen)](https://github.com/mahendrapaipuri/ceems/actions/workflows/ci.yml?query=branch%3Amain) |
| CI/CD | [![ci](https://github.com/mahendrapaipuri/ceems/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/mahendrapaipuri/ceems/actions/workflows/ci.yml?query=branch%3Amain) [![CircleCI](https://dl.circleci.com/status-badge/img/circleci/8jSYT1wyKY8mKQRTqNLThX/TzM1Mr3AEAqmehnoCde19R/tree/main.svg?style=svg&circle-token=28db7268f3492790127da28e62e76b0991d59c8b)](https://dl.circleci.com/status-badge/redirect/circleci/8jSYT1wyKY8mKQRTqNLThX/TzM1Mr3AEAqmehnoCde19R/tree/main) [![Coverage](https://img.shields.io/badge/Coverage-75.9%25-brightgreen)](https://github.com/mahendrapaipuri/ceems/actions/workflows/ci.yml?query=branch%3Amain) |
| Docs | [![docs](https://img.shields.io/badge/docs-passing-green?style=flat&link=https://mahendrapaipuri.github.io/ceems/docs/)](https://mahendrapaipuri.github.io/ceems/) |
| Package | [![Release](https://img.shields.io/github/v/release/mahendrapaipuri/ceems.svg?include_prereleases)](https://github.com/mahendrapaipuri/ceems/releases/latest) |
| Meta | [![GitHub License](https://img.shields.io/github/license/mahendrapaipuri/ceems)](https://github.com/mahendrapaipuri/ceems) [![Go Report Card](https://goreportcard.com/badge/github.com/mahendrapaipuri/ceems)](https://goreportcard.com/report/github.com/mahendrapaipuri/ceems) [![code style](https://img.shields.io/badge/code%20style-gofmt-blue.svg)](https://pkg.go.dev/cmd/gofmt) |
Expand All @@ -14,30 +14,35 @@
<img src="https://raw.githubusercontent.com/mahendrapaipuri/ceems/main/website/static/img/logo.png" width="200">
</p>

Compute Energy & Emissions Monitoring Stack (CEEMS) (pronounced as *kiːms*) contains
a Prometheus exporter to export metrics of compute instance units and a REST API
Compute Energy & Emissions Monitoring Stack (CEEMS) (pronounced as *kiːms*) contains
a Prometheus exporter to export metrics of compute instance units and a REST API
server that serves the metadata and aggregated metrics of each
compute unit. Optionally, it includes a TSDB load balancer that supports basic access
control on TSDB so that one user cannot access metrics of another user.

"Compute Unit" in the current context has a wider scope. It can be a batch job in HPC,
a VM in cloud, a pod in k8s, _etc_. The main objective of the repository is to quantify
a VM in cloud, a pod in k8s, *etc*. The main objective of the repository is to quantify
the energy consumed and estimate emissions by each "compute unit". The repository itself
does not provide any frontend apps to show dashboards and it is meant to be used along
with Grafana and Prometheus to show statistics to users.

Although CEEMS was born out of a need to monitor the energy and carbon footprint of compute
workloads, it supports monitoring performance metrics as well. In addition, it leverages
the [eBPF](https://ebpf.io/what-is-ebpf/) framework to monitor IO and network metrics
in a resource manager agnostic way.

## Install CEEMS

> [!WARNING]
> DO NOT USE pre-release versions as the API has changed quite a lot between the
> [!WARNING]
> DO NOT USE pre-release versions as the API has changed quite a lot between the
pre-release and stable versions.

Installation instructions of CEEMS components can be found in
Installation instructions of CEEMS components can be found in
[docs](https://mahendrapaipuri.github.io/ceems/docs/category/installation).

## Visualizing metrics with Grafana

CEEMS is meant to be used with Grafana for visualization and below are some of the
CEEMS is meant to be used with Grafana for visualization and below are some of the
screenshots of a few possible metrics.

### Time series compute unit CPU metrics
Expand All @@ -46,7 +51,7 @@ screenshots few possible metrics.
<img src="https://raw.githubusercontent.com/mahendrapaipuri/ceems/main/website/static/img/dashboards/cpu_ts_stats.png" width="1200">
</p>

### Time series compute unit GPU metrics
### Time series compute unit GPU metrics

<p align="center">
<img src="https://raw.githubusercontent.com/mahendrapaipuri/ceems/main/website/static/img/dashboards/gpu_ts_stats.png" width="1200">
Expand All @@ -71,9 +76,9 @@ screenshots few possible metrics.

## Contributing

We welcome contributions to this project, we hope to see this project grow and become
a useful tool for people who are interested in the energy and carbon footprint of their
We welcome contributions to this project, we hope to see this project grow and become
a useful tool for people who are interested in the energy and carbon footprint of their
workloads.

Please feel free to open issues and/or discussions for any potential ideas of
Please feel free to open issues and/or discussions for any potential ideas of
improvement.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.22.5

require (
github.com/alecthomas/kingpin/v2 v2.4.0
github.com/cilium/ebpf v0.11.0
github.com/containerd/cgroups/v3 v3.0.4-0.20240117155926-c00d22e55fef
github.com/go-chi/httprate v0.14.1
github.com/go-kit/log v0.2.1
Expand All @@ -30,7 +31,6 @@ require (
github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cilium/ebpf v0.11.0 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/docker/go-units v0.5.0 // indirect
Expand Down
19 changes: 19 additions & 0 deletions internal/osexec/osexec.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package osexec

import (
"context"
"errors"
"math"
"os"
"os/exec"
Expand All @@ -18,6 +19,12 @@ const (
sudoCmd = "sudo"
)

// Custom errors.
// Sentinel errors returned by the ExecuteAs* helpers when the requested
// credentials fail validation, before any command is started.
var (
// ErrInvalidUID is returned when uid is not in the range 1..math.MaxInt32.
ErrInvalidUID = errors.New("invalid UID")
// ErrInvalidGID is returned when gid is not in the range 1..math.MaxInt32.
ErrInvalidGID = errors.New("invalid GID")
)

// Execute command and return stdout/stderr.
func Execute(cmd string, args []string, env []string, logger log.Logger) ([]byte, error) {
level.Debug(logger).Log("msg", "Executing", "command", cmd, "args", strings.Join(args, " "))
Expand Down Expand Up @@ -63,10 +70,14 @@ func ExecuteAs(cmd string, args []string, uid int, gid int, env []string, logger
var uidInt32, gidInt32 uint32
if uid > 0 && uid <= math.MaxInt32 {
uidInt32 = uint32(uid) //nolint:gosec
} else {
return nil, ErrInvalidUID
}

if gid > 0 && gid <= math.MaxInt32 {
gidInt32 = uint32(gid) //nolint:gosec
} else {
return nil, ErrInvalidGID
}

// According to setpgid docs (https://man7.org/linux/man-pages/man2/setpgid.2.html)
Expand Down Expand Up @@ -153,10 +164,14 @@ func ExecuteAsContext(
var uidInt32, gidInt32 uint32
if uid > 0 && uid <= math.MaxInt32 {
uidInt32 = uint32(uid) //nolint:gosec
} else {
return nil, ErrInvalidUID
}

if gid > 0 && gid <= math.MaxInt32 {
gidInt32 = uint32(gid) //nolint:gosec
} else {
return nil, ErrInvalidGID
}

// According to setpgid docs (https://man7.org/linux/man-pages/man2/setpgid.2.html)
Expand Down Expand Up @@ -268,10 +283,14 @@ func ExecuteAsWithTimeout(
var uidInt32, gidInt32 uint32
if uid > 0 && uid <= math.MaxInt32 {
uidInt32 = uint32(uid) //nolint:gosec
} else {
return nil, ErrInvalidUID
}

if gid > 0 && gid <= math.MaxInt32 {
gidInt32 = uint32(gid) //nolint:gosec
} else {
return nil, ErrInvalidGID
}

// According to setpgid docs (https://man7.org/linux/man-pages/man2/setpgid.2.html)
Expand Down
Loading