Skip to content

Commit

Permalink
koordlet: tc plugin for netqos
Browse files Browse the repository at this point in the history
Signed-off-by: lucming <2876757716@qq.com>
  • Loading branch information
lucming committed Mar 5, 2024
1 parent c590941 commit 4f2060b
Show file tree
Hide file tree
Showing 17 changed files with 1,406 additions and 12 deletions.
74 changes: 74 additions & 0 deletions apis/extension/netqos_tc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package extension

import corev1 "k8s.io/api/core/v1"

// NetQosGlobalConfig carries the node-wide network bandwidth settings used by
// the net QoS plugins. Tx fields describe the egress direction and Rx fields
// the ingress direction; Min fields hold a bandwidth request and Max fields
// hold a bandwidth limit.
//
// NOTE(review): whether "Bps" means bits or bytes per second is not
// established in this file — confirm against the consumers of this config.
type NetQosGlobalConfig struct {
	// HwTxBpsMax is the total egress bandwidth of the node.
	HwTxBpsMax uint64 `json:"hw_tx_bps_max"`
	// HwRxBpsMax is the total ingress bandwidth of the node.
	HwRxBpsMax uint64 `json:"hw_rx_bps_max"`
	// L1TxBpsMin is the egress request of the L1 level (the tc runtime hook
	// fills L1 fields from the LS class of the NodeSLO).
	L1TxBpsMin uint64 `json:"l1_tx_bps_min"`
	// L1TxBpsMax is the egress limit of the L1 level.
	L1TxBpsMax uint64 `json:"l1_tx_bps_max"`
	// L2TxBpsMin is the egress request of the L2 level (the tc runtime hook
	// fills L2 fields from the BE class of the NodeSLO).
	L2TxBpsMin uint64 `json:"l2_tx_bps_min"`
	// L2TxBpsMax is the egress limit of the L2 level.
	L2TxBpsMax uint64 `json:"l2_tx_bps_max"`
	// L1RxBpsMin is the ingress request of the L1 level.
	L1RxBpsMin uint64 `json:"l1_rx_bps_min"`
	// L1RxBpsMax is the ingress limit of the L1 level.
	L1RxBpsMax uint64 `json:"l1_rx_bps_max"`
	// L2RxBpsMin is the ingress request of the L2 level.
	L2RxBpsMin uint64 `json:"l2_rx_bps_min"`
	// L2RxBpsMax is the ingress limit of the L2 level.
	L2RxBpsMax uint64 `json:"l2_rx_bps_max"`
}

// NetQoSClass names the network QoS class a pod is assigned to.
type NetQoSClass string

const (
	// NETQoSHigh is the network class returned for pods whose QoS is QoSSystem.
	NETQoSHigh NetQoSClass = "high_class"
	// NETQoSMid is the network class returned for QoSLSE, QoSLSR and QoSLS pods.
	NETQoSMid NetQoSClass = "mid_class"
	// NETQoSLow is the network class returned for QoSBE pods.
	NETQoSLow NetQoSClass = "low_class"
	// NETQoSNone means no network class applies (unknown or missing QoS).
	NETQoSNone NetQoSClass = ""

	// NETQOSConfigPathForNode and NETQOSConfigPathForPod are filesystem paths
	// for node-level and pod-level net QoS data.
	// NOTE(review): presumably the location where net QoS configuration is
	// exchanged with the plugin — their readers/writers are not in this file.
	NETQOSConfigPathForNode = "/var/run/koordinator/net/node"
	NETQOSConfigPathForPod = "/var/run/koordinator/net/pods"
)

// GetPodNetQoSClassByName translates a koordinator QoS class name into the
// network QoS class used for traffic shaping:
//
//   - QoSSystem              -> NETQoSHigh
//   - QoSLSE, QoSLSR, QoSLS  -> NETQoSMid
//   - QoSBE                  -> NETQoSLow
//
// Any other value, including the empty string, yields NETQoSNone.
func GetPodNetQoSClassByName(qos string) NetQoSClass {
	switch QoSClass(qos) {
	case QoSBE:
		return NETQoSLow
	case QoSLS, QoSLSR, QoSLSE:
		return NETQoSMid
	case QoSSystem:
		return NETQoSHigh
	default:
		return NETQoSNone
	}
}

// GetPodNetQoSClass returns the network QoS class of the given pod, derived
// from its labels and annotations. A nil pod, or a pod without any labels,
// yields NETQoSNone.
func GetPodNetQoSClass(pod *corev1.Pod) NetQoSClass {
	if pod != nil && pod.Labels != nil {
		return GetNetQoSClassByAttrs(pod.Labels, pod.Annotations)
	}
	return NETQoSNone
}

// GetNetQoSClassByAttrs resolves the network QoS class from pod metadata.
// Only the LabelPodQoS label is consulted; when it is absent the result is
// NETQoSNone. The annotations parameter is currently not read and is kept in
// the signature for adapting to the old format.
func GetNetQoSClassByAttrs(labels, annotations map[string]string) NetQoSClass {
	qosName, found := labels[LabelPodQoS]
	if !found {
		return NETQoSNone
	}
	return GetPodNetQoSClassByName(qosName)
}
3 changes: 3 additions & 0 deletions config/crd/bases/slo.koordinator.sh_nodeslos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1267,6 +1267,9 @@ spec:
policies:
description: Policies of pod QoS.
properties:
NETQOSPolicy:
description: applied policy for the Net QoS, default = "tc"
type: string
cpuPolicy:
description: applied policy for the CPU QoS, default = "groupIdentity"
type: string
Expand Down
2 changes: 2 additions & 0 deletions docker/koordlet.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ARG TARGETARCH
ENV VERSION $VERSION
ENV GOOS linux
ENV GOARCH $TARGETARCH
ENV GOPROXY https://goproxy.cn,direct

COPY go.mod go.mod
COPY go.sum go.sum
Expand Down Expand Up @@ -35,6 +36,7 @@ RUN go build -a -o koordlet cmd/koordlet/main.go
FROM --platform=$TARGETPLATFORM nvidia/cuda:11.6.2-base-ubuntu20.04
WORKDIR /
RUN apt-get update && apt-get install -y lvm2 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y iptables
COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/koordlet .
COPY --from=builder /usr/local/lib /usr/lib
ENTRYPOINT ["/koordlet"]
Binary file added docs/images/netqos.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
57 changes: 47 additions & 10 deletions docs/proposals/koordlet/20231208-support-netqos.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ last-updated: 2023-12-08
- [Design Principles](#design-principles)
- [Implementation Details](#implementation-details)
- [koordlet:](#koordlet)
- [API](#api)
- [node level](#node-level)
- [pod level](#pod-level)
- [supported plugins](#supported-plugins)
- [external](#external-plugins)
- [internal](#internal-plugins)
- [api:](#api)
- [node level](#node-level)
- [pod level](#pod-level)
- [supported plugins:](#supported-plugins)
- [external plugins:](#external-plugins)
- [internal plugins:](#internal-plugins)
- [koord-scheduler](#koord-scheduler)
- [koord-descheduler](#koord-descheduler)
- [usage:](#usage)
Expand All @@ -49,12 +49,12 @@ last-updated: 2023-12-08
## Glossary

[ebpf](https://ebpf.io/what-is-ebpf/)

[ebpf tc](https://arthurchiao.art/blog/cilium-bpf-xdp-reference-guide-zh/#prog_type_tc)

[edt](https://arthurchiao.art/blog/better-bandwidth-management-with-ebpf-zh/#31-%E6%95%B4%E4%BD%93%E8%AE%BE%E8%AE%A1%E5%9F%BA%E4%BA%8E-bpfedt-%E5%AE%9E%E7%8E%B0%E5%AE%B9%E5%99%A8%E9%99%90%E9%80%9F)

[terway-qos](https://github.com/AliyunContainerService/terway-qos/blob/main/README-zh_CN.md)
[net_cls cgroup](https://www.kernel.org/doc/Documentation/cgroup-v1/net_cls.txt)
[tc](https://man7.org/linux/man-pages/man8/tc.8.html) (traffic control)
[ipset](https://linux.die.net/man/8/ipset)

## Summary

Expand Down Expand Up @@ -331,7 +331,44 @@ After that, the netqos plugin will implement the network limiting operation base

- <span style="color: beige; ">for node </span>
we can refer to the koordinator's API directly, to do some netqos operation.

The `koordlet` initializes the `tc`, `iptables`, and `ipset` rules according to the `NodeSLO` CR.
After that, it only needs to watch pods and update the `ipset` of the `tc` class that each pod belongs to.

When `koordlet` starts, it creates the `tc` rules and the associated `ipset` objects on the physical `NIC` of the host.
Each `tc` `class` corresponds to one `ipset`. The `ipset` declares a group of pods that have the same `tc` class priority
and therefore share the network bandwidth of that `tc` class. By default, each `tc` `class` can use up all the network bandwidth of the node.
Three classes are defined, `high_class`/`mid_class`/`low_class`, and each pod is matched to one of these `tc` classes.

![image](/docs/images/netqos.jpg)

Logic for `htb qdisc` selection of specific classes:
1. The `htb` algorithm starts at the bottom of the `class` tree and works its way up to find `classes` with the `CAN_SEND` status.
2. If more than one `class` in the layer is in the `CAN_SEND` state, the `class` with the highest priority (lowest value)
is selected. After each `class` has sent its own `quantum` bytes, it is the next `class`'s turn to send.

Configuration of parameters for the specific class corresponding to each priority pod:
| PRIO | HIGH | MID | LOW |
| ---- | ---- | ---- | ---- |
| net_prio | 0 | 1 | 2 |
| net_cls | 1:2 | 1:3 | 1:4 |
| htb.rate | 40% | 30% | 30% |
| htb.ceil | 100% | 100% | 100% |

Specific setup method:
```bash
# With an entire network bandwidth of 1000Mbit, the following rules are created.
tc qdisc add dev eth0 root handle 1:0 htb default 1
tc class add dev eth0 parent 1:0 classid 1:1 htb rate 1000Mbit
tc class add dev eth0 parent 1:1 classid 1:2 htb rate 400Mbit ceil 1000Mbit prio 0
tc class add dev eth0 parent 1:1 classid 1:3 htb rate 300Mbit ceil 1000Mbit prio 1
tc class add dev eth0 parent 1:1 classid 1:4 htb rate 300Mbit ceil 1000Mbit prio 2
ipset create high_class hash:net
iptables -t mangle -A POSTROUTING -m set --match-set high_class src -j CLASSIFY --set-class 1:2
ipset create mid_class hash:net
iptables -t mangle -A POSTROUTING -m set --match-set mid_class src -j CLASSIFY --set-class 1:3
ipset create low_class hash:net
iptables -t mangle -A POSTROUTING -m set --match-set low_class src -j CLASSIFY --set-class 1:4
```


#### koord-scheduler
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82
github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5
github.com/containerd/nri v0.3.0
github.com/coreos/go-iptables v0.5.0
github.com/docker/docker v20.10.21+incompatible
github.com/evanphx/json-patch v5.6.0+incompatible
github.com/fsnotify/fsnotify v1.6.0
Expand Down Expand Up @@ -185,7 +186,7 @@ require (
github.com/stretchr/objx v0.5.0 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/ugorji/go/codec v1.2.7 // indirect
github.com/vishvananda/netlink v1.1.1-0.20210330154013-f5de75959ad5 // indirect
github.com/vishvananda/netlink v1.1.1-0.20210330154013-f5de75959ad5
github.com/vishvananda/netns v0.0.0-20210104183010-2eb08e3e575f // indirect
github.com/vmware/govmomi v0.30.0 // indirect
go.etcd.io/etcd/api/v3 v3.5.5 // indirect
Expand Down
1 change: 1 addition & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkE
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/etcd v3.3.13+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-iptables v0.4.5/go.mod h1:/mVI274lEDI2ns62jHCDnCyBF9Iwsmekav8Dbxlm1MU=
github.com/coreos/go-iptables v0.5.0 h1:mw6SAibtHKZcNzAsOxjoHIG0gy5YFHhypWSSNc6EjbQ=
github.com/coreos/go-iptables v0.5.0/go.mod h1:/mVI274lEDI2ns62jHCDnCyBF9Iwsmekav8Dbxlm1MU=
github.com/coreos/go-oidc v2.1.0+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHoZ1nMCKZlZ9V6mm3/LKc=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
Expand Down
8 changes: 8 additions & 0 deletions pkg/koordlet/runtimehooks/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/gpu"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/groupidentity"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/terwayqos"

Check failure on line 34 in pkg/koordlet/runtimehooks/config.go

View workflow job for this annotation

GitHub Actions / golangci-lint

File is not `gofmt`-ed with `-s` (gofmt)
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/tc"
"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
)

Expand Down Expand Up @@ -80,6 +81,11 @@ const (
// owner: @l1b0k
// alpha: v1.5
TerwayQoS featuregate.Feature = "TerwayQoS"

// NetQosByTC enables a network QoS implementation based on tc.
// owner: @lucming
// alpha: v1.5
NetQosByTC featuregate.Feature = "TC"
)

var (
Expand All @@ -91,6 +97,7 @@ var (
CPUNormalization: {Default: false, PreRelease: featuregate.Alpha},
CoreSched: {Default: false, PreRelease: featuregate.Alpha},
TerwayQoS: {Default: false, PreRelease: featuregate.Alpha},
NetQosByTC: {Default: false, PreRelease: featuregate.Alpha},
}

runtimeHookPlugins = map[featuregate.Feature]HookPlugin{
Expand All @@ -101,6 +108,7 @@ var (
CPUNormalization: cpunormalization.Object(),
CoreSched: coresched.Object(),
TerwayQoS: terwayqos.Object(),
NetQosByTC: tc.Object(),
}
)

Expand Down
90 changes: 90 additions & 0 deletions pkg/koordlet/runtimehooks/hooks/tc/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tc

import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/koordinator-sh/koordinator/apis/extension"
slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1"
)

// loadConfigFromNodeSlo builds the global net QoS configuration from a NodeSLO
// spec.
//
// The node's total bandwidth (SystemStrategy.TotalNetworkBandwidth) becomes the
// hardware Rx/Tx maximum. The LS class network QoS settings populate the L1
// fields and the BE class settings populate the L2 fields; a class is only
// applied when its NetworkQOS section exists and is explicitly enabled.
//
// The function never returns nil: a nil spec, a missing strategy or a disabled
// class simply leaves the corresponding fields at zero.
func loadConfigFromNodeSlo(nodesloSpec *slov1alpha1.NodeSLOSpec) *extension.NetQosGlobalConfig {
	res := &extension.NetQosGlobalConfig{}
	// A nil spec carries no configuration; return the zero config instead of
	// dereferencing it below.
	if nodesloSpec == nil {
		return res
	}

	var total uint64
	if nodesloSpec.SystemStrategy != nil {
		// Guard against a negative quantity wrapping around when converted to
		// an unsigned value.
		if v := nodesloSpec.SystemStrategy.TotalNetworkBandwidth.Value(); v > 0 {
			total = uint64(v)
		}
		res.HwRxBpsMax = total
		res.HwTxBpsMax = total
	}

	strategy := nodesloSpec.ResourceQOSStrategy
	if strategy == nil {
		return res
	}

	// Enable is a pointer; an unset value is treated as "not enabled" rather
	// than being dereferenced.
	if ls := strategy.LSClass; ls != nil && ls.NetworkQOS != nil &&
		ls.NetworkQOS.Enable != nil && *ls.NetworkQOS.Enable {
		cur := ls.NetworkQOS
		res.L1RxBpsMin = getBandwidthVal(total, cur.IngressRequest)
		res.L1RxBpsMax = getBandwidthVal(total, cur.IngressLimit)
		res.L1TxBpsMin = getBandwidthVal(total, cur.EgressRequest)
		res.L1TxBpsMax = getBandwidthVal(total, cur.EgressLimit)
	}

	if be := strategy.BEClass; be != nil && be.NetworkQOS != nil &&
		be.NetworkQOS.Enable != nil && *be.NetworkQOS.Enable {
		cur := be.NetworkQOS
		res.L2RxBpsMin = getBandwidthVal(total, cur.IngressRequest)
		res.L2RxBpsMax = getBandwidthVal(total, cur.IngressLimit)
		res.L2TxBpsMin = getBandwidthVal(total, cur.EgressRequest)
		res.L2TxBpsMax = getBandwidthVal(total, cur.EgressLimit)
	}

	return res
}

// getBandwidthVal converts an IntOrString bandwidth setting into an absolute
// value. An integer is interpreted as a percentage of total, a string is
// parsed as a resource quantity. A nil setting or an unknown type yields 0.
func getBandwidthVal(total uint64, intOrPercent *intstr.IntOrString) uint64 {
	if intOrPercent == nil {
		return 0
	}

	if intOrPercent.Type == intstr.Int {
		return getBandwidthByPercentageFormat(total, intOrPercent.IntValue())
	}
	if intOrPercent.Type == intstr.String {
		return getBandwidthByQuantityFormat(intOrPercent.StrVal)
	}
	return 0
}

// getBandwidthByQuantityFormat parses a Kubernetes resource quantity string
// (for example "50M") and returns its integer value. It returns 0 when the
// string cannot be parsed or when the quantity is negative, because a negative
// value would otherwise wrap around to a huge unsigned bandwidth.
func getBandwidthByQuantityFormat(quanityStr string) uint64 {
	val, err := resource.ParseQuantity(quanityStr)
	if err != nil {
		// Best effort: an unparsable setting is treated as "not configured".
		return 0
	}

	v := val.Value()
	if v < 0 {
		return 0
	}
	return uint64(v)
}

// getBandwidthByPercentageFormat returns percentage percent of total, rounded
// down. A zero or negative percentage yields 0 (a negative value would
// otherwise wrap around when converted to an unsigned integer).
func getBandwidthByPercentageFormat(total uint64, percentage int) uint64 {
	if percentage <= 0 {
		return 0
	}

	pct := uint64(percentage)
	// Split total into whole hundreds and a remainder so the intermediate
	// product cannot overflow for large totals; the result equals
	// floor(total * pct / 100) exactly.
	return (total/100)*pct + (total%100)*pct/100
}
Loading

0 comments on commit 4f2060b

Please sign in to comment.