diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 91ac17b90..333291e5a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -117,8 +117,7 @@ jobs: platforms: linux/amd64 push: true pull: true - build-args: | - MODULE=${{ matrix.target }} + file: docker/${{ matrix.target }}.dockerfile labels: | org.opencontainers.image.title=${{ matrix.target }} org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 59c931466..180d569cc 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -5,13 +5,13 @@ before: hooks: - go mod tidy builds: - - env: + - id: koord-runtime-proxy + env: - CGO_ENABLED=0 goos: - linux goarch: - amd64 - id: koord-runtime-proxy main: ./cmd/koord-runtime-proxy binary: koord-runtime-proxy ldflags: @@ -20,13 +20,13 @@ builds: - -X github.com/koordinator-sh/koordinator/pkg/version.buildDate={{ .Date }} - -X github.com/koordinator-sh/koordinator/pkg/version.gitCommit={{ .Commit }} - -X github.com/koordinator-sh/koordinator/pkg/version.gitTreeState=clean - - env: + - id: koord-manager + env: - CGO_ENABLED=0 goos: - linux goarch: - amd64 - id: koord-manager main: ./cmd/koord-manager binary: koord-manager ldflags: @@ -35,13 +35,13 @@ builds: - -X github.com/koordinator-sh/koordinator/pkg/version.buildDate={{ .Date }} - -X github.com/koordinator-sh/koordinator/pkg/version.gitCommit={{ .Commit }} - -X github.com/koordinator-sh/koordinator/pkg/version.gitTreeState=clean - - env: + - id: koord-scheduler + env: - CGO_ENABLED=0 goos: - linux goarch: - amd64 - id: koord-scheduler main: ./cmd/koord-scheduler binary: koord-scheduler ldflags: @@ -50,13 +50,13 @@ builds: - -X github.com/koordinator-sh/koordinator/pkg/version.buildDate={{ .Date }} - -X github.com/koordinator-sh/koordinator/pkg/version.gitCommit={{ .Commit }} - -X github.com/koordinator-sh/koordinator/pkg/version.gitTreeState=clean - - env: - - CGO_ENABLED=0 + - id: koordlet + env: + - CGO_ENABLED=1 goos: - linux goarch: - amd64 - id: koordlet main: ./cmd/koordlet binary: koordlet ldflags: @@ -87,10 +87,11 @@ changelog: - '^chore:' - '^feat(deps):' dockers: - - image_templates: + - id: koord-manager + image_templates: - "ghcr.io/{{.ProjectName}}/koord-manager:{{ .Version }}" - "registry.cn-beijing.aliyuncs.com/{{.ProjectName}}/koord-manager:{{ .Version }}" - dockerfile: .goreleaser/Dockerfile + dockerfile: .goreleaser/koord-manager.dockerfile build_flag_templates: - "--pull" - "--label=org.opencontainers.image.title=koord-manager" @@ -99,16 +100,15 @@ dockers: - "--label=org.opencontainers.image.revision={{.FullCommit}}" - "--label=org.opencontainers.image.version={{.Version}}" - "--label=org.opencontainers.image.licenses=Apache-2.0" - - "--build-arg=MODULE=koord-manager" - id: koord-manager ids: - koord-manager goos: linux goarch: amd64 - - image_templates: + - id: koordlet + image_templates: - "ghcr.io/{{.ProjectName}}/koordlet:{{ .Version }}" - "registry.cn-beijing.aliyuncs.com/{{.ProjectName}}/koordlet:{{ .Version }}" - dockerfile: .goreleaser/Dockerfile + dockerfile: .goreleaser/koordlet.dockerfile build_flag_templates: - "--pull" - "--label=org.opencontainers.image.title=koordlet" @@ -117,16 +117,15 @@ dockers: - "--label=org.opencontainers.image.revision={{.FullCommit}}" - "--label=org.opencontainers.image.version={{.Version}}" - "--label=org.opencontainers.image.licenses=Apache-2.0" - - "--build-arg=MODULE=koordlet" - id: koordlet ids: - koordlet goos: linux goarch: amd64 - - image_templates: + - id: 
koord-scheduler + image_templates: - "ghcr.io/{{.ProjectName}}/koord-scheduler:{{ .Version }}" - "registry.cn-beijing.aliyuncs.com/{{.ProjectName}}/koord-scheduler:{{ .Version }}" - dockerfile: .goreleaser/Dockerfile + dockerfile: .goreleaser/koord-scheduler.dockerfile build_flag_templates: - "--pull" - "--label=org.opencontainers.image.title=koord-scheduler" @@ -135,8 +134,6 @@ dockers: - "--label=org.opencontainers.image.revision={{.FullCommit}}" - "--label=org.opencontainers.image.version={{.Version}}" - "--label=org.opencontainers.image.licenses=Apache-2.0" - - "--build-arg=MODULE=koord-scheduler" - id: koord-scheduler ids: - koord-scheduler goos: linux diff --git a/.goreleaser/Dockerfile b/.goreleaser/Dockerfile deleted file mode 100644 index 8905d45fa..000000000 --- a/.goreleaser/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -# Use distroless as minimal base image to package the manager binary -# Refer to https://github.com/GoogleContainerTools/distroless for more details -FROM alpine:3.12 -RUN apk add --update bash net-tools iproute2 logrotate less rsync util-linux -WORKDIR / -ARG MODULE -COPY ${MODULE} . diff --git a/.goreleaser/koord-manager.dockerfile b/.goreleaser/koord-manager.dockerfile new file mode 100644 index 000000000..1a47c5224 --- /dev/null +++ b/.goreleaser/koord-manager.dockerfile @@ -0,0 +1,4 @@ +FROM gcr.io/distroless/static:latest +WORKDIR / +COPY koord-manager . +ENTRYPOINT ["/koord-manager"] diff --git a/.goreleaser/koord-scheduler.dockerfile b/.goreleaser/koord-scheduler.dockerfile new file mode 100644 index 000000000..e911b3415 --- /dev/null +++ b/.goreleaser/koord-scheduler.dockerfile @@ -0,0 +1,4 @@ +FROM gcr.io/distroless/static:latest +WORKDIR / +COPY koord-scheduler . +ENTRYPOINT ["/koord-scheduler"] diff --git a/.goreleaser/koordlet.dockerfile b/.goreleaser/koordlet.dockerfile new file mode 100644 index 000000000..ff6079600 --- /dev/null +++ b/.goreleaser/koordlet.dockerfile @@ -0,0 +1,4 @@ +FROM nvidia/cuda:11.6.1-base-ubuntu20.04 +WORKDIR / +COPY koordlet . +ENTRYPOINT ["/koordlet"] diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b23c85ffb..000000000 --- a/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -# Build the manager binary -FROM golang:1.17-alpine as builder - -ARG MODULE - -WORKDIR /go/src/github.com/koordinator-sh/koordinator -RUN apk add --update make git bash rsync gcc musl-dev - -# Copy the go source -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum -# cache deps before building and copying source so that we don't need to re-download as much -# and so that source changes don't invalidate our downloaded layer -RUN go mod download - -# Copy the go source -COPY apis/ apis/ -COPY cmd/ cmd/ -COPY pkg/ pkg/ - -# Build -ENV GOOS linux -ENV GOARCH amd64 -RUN go build -a -o ${MODULE} /go/src/github.com/koordinator-sh/koordinator/cmd/${MODULE}/main.go - -# Use distroless as minimal base image to package the manager binary -# Refer to https://github.com/GoogleContainerTools/distroless for more details -FROM alpine:3.12 -RUN apk add --update bash net-tools iproute2 logrotate less rsync util-linux -WORKDIR / -ARG MODULE -COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/${MODULE} . diff --git a/Makefile b/Makefile index a80d7532d..82ebf8cf8 100644 --- a/Makefile +++ b/Makefile @@ -114,15 +114,15 @@ docker-build: test docker-build-koordlet docker-build-koord-manager docker-build .PHONY: docker-build-koordlet docker-build-koordlet: ## Build docker image with the koordlet. 
- docker build --build-arg MODULE=koordlet -t ${KOORDLET_IMG} . + docker build --pull -t ${KOORDLET_IMG} -f docker/koordlet.dockerfile . .PHONY: docker-build-koord-manager docker-build-koord-manager: ## Build docker image with the koord-manager. - docker build --build-arg MODULE=koord-manager -t ${KOORD_MANAGER_IMG} . + docker build --pull -t ${KOORD_MANAGER_IMG} -f docker/koord-manager.dockerfile . .PHONY: docker-build-koord-scheduler docker-build-koord-scheduler: ## Build docker image with the scheduler. - docker build --build-arg MODULE=koord-scheduler -t ${KOORD_SCHEDULER_IMG} . + docker build --pull -t ${KOORD_SCHEDULER_IMG} -f docker/koord-scheduler.dockerfile . .PHONY: docker-push docker-push: docker-push-koordlet docker-push-koord-manager docker-push-koord-scheduler diff --git a/README-zh_CN.md b/README-zh_CN.md new file mode 100644 index 000000000..ac285329f --- /dev/null +++ b/README-zh_CN.md @@ -0,0 +1,65 @@ +

+<!-- Koordinator logo and project title -->

+ +[![License](https://img.shields.io/github/license/koordinator-sh/koordinator.svg?color=4EB1BA&style=flat-square)](https://opensource.org/licenses/Apache-2.0) +[![GitHub release](https://img.shields.io/github/v/release/koordinator-sh/koordinator.svg?style=flat-square)](https://github.com/koordinator-sh/koordinator/releases/latest) +[![CI](https://img.shields.io/github/workflow/status/koordinator-sh/koordinator/CI?label=CI&logo=github&style=flat-square)](https://github.com/koordinator-sh/koordinator/actions/workflows/ci.yaml) +[![Go Report Card](https://goreportcard.com/badge/github.com/koordinator-sh/koordinator?style=flat-square)](https://goreportcard.com/report/github.com/koordinator-sh/koordinator) +[![codecov](https://img.shields.io/codecov/c/github/koordinator-sh/koordinator?logo=codecov&style=flat-square)](https://codecov.io/github/koordinator-sh/koordinator) +[![PRs Welcome](https://badgen.net/badge/PRs/welcome/green?icon=https://api.iconify.design/octicon:git-pull-request.svg?color=white&style=flat-square)](CONTRIBUTING.md) +[![Slack](https://badgen.net/badge/slack/join/4A154B?icon=slack&style=flat-square)](https://join.slack.com/t/koordinator-sh/shared_invite/zt-1756qoub4-Cn4~esfdlfAPsD7cwO2NzA) + + +[English](./README.md) | 简体中文 + + + +## 介绍 + +Koordinator 基于 QoS 机制,支持 Kubernetes 上多种工作负载的混部调度。它旨在提高工作负载的运行时效率和可靠性(包括延迟敏感型负载和批处理任务),简化资源相关的配置调优,增加 Pod 部署密度以提高资源利用率。 + +Koordinator 通过提供如下功能来增强用户在 Kubernetes 上管理工作负载的体验: + +- 精心设计的 Priority 和 QoS 机制,支持在一个集群或者一个节点上混部不同的工作负载。 +- 采用应用画像机制(application profiling mechanism),支持超卖资源,以实现在满足 QoS 保障的前提下实现高资源利用率。 +- 细粒度的资源编排和隔离机制以提高工作负载(包括延迟敏感型负载和批处理任务)的效率。 +- 灵活的任务调度机制以支持特定领域(如大数据、AI、音频和视频)的工作负载。 +- 一套支持监控、故障排除、运维的工具集。 + +## 快速开始 + +你可以在 [Koordinator website](https://koordinator.sh/docs) 查看到完整的文档集。 + +- 安装/升级 Koordinator [最新版本](https://koordinator.sh/docs/installation) +- 参考[最佳实践](https://koordinator.sh/docs/best-practices/colocation-of-spark-jobs),里面有一些关于运行混部工作负载的示例。 + +## 行为守则 + +Koordinator 社区遵照[行为守则](CODE_OF_CONDUCT.md)。我们鼓励每个人在参与之前先读一下它。 + +为了营造一个开放和热情的环境,我们作为贡献者和维护者承诺:无论年龄、体型、残疾、种族、经验水平、教育程度、社会经济地位、国籍、个人外貌、种族、宗教或性认同和性取向如何,参与我们的项目和社区的每个人都不会受到骚扰。 + +## 贡献 + +我们非常欢迎每一位社区同学共同参与 Koordinator 的建设,你可以从 [CONTRIBUTING.md](CONTRIBUTING.md) 手册开始。 + +## 成员 + +我们鼓励所有贡献者成为成员。我们的目标是发展一个由贡献者、审阅者和代码所有者组成的活跃、健康的社区。在我们的[社区成员](docs/community/community-membership.md)页面,详细了解我们的成员要求和责任。 + +## 社区 + +你可以通过如下途径联系到项目维护者: + +- [Slack](https://join.slack.com/t/koordinator-sh/shared_invite/zt-1756qoub4-Cn4~esfdlfAPsD7cwO2NzA) +- 钉钉( Chinese ): 搜索群ID `33383887`或者扫描二维码加入 + +
+<!-- image: Dingtalk QRCode -->
+ +## License + +Koordinator is licensed under the Apache License, Version 2.0. See [LICENSE](./LICENSE) for the full license text. diff --git a/README.md b/README.md index f6f3c61ff..08c66e85f 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ [![PRs Welcome](https://badgen.net/badge/PRs/welcome/green?icon=https://api.iconify.design/octicon:git-pull-request.svg?color=white&style=flat-square)](CONTRIBUTING.md) [![Slack](https://badgen.net/badge/slack/join/4A154B?icon=slack&style=flat-square)](https://join.slack.com/t/koordinator-sh/shared_invite/zt-1756qoub4-Cn4~esfdlfAPsD7cwO2NzA) +English | [简体中文](./README-zh_CN.md) ## Introduction Koordinator is a QoS based scheduling system for hybrid orchestration workloads on Kubernetes. It aims to improve the @@ -42,7 +43,7 @@ before participating. In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, -disability, ethnicity, gender identity and expression, level of experience, education, socio-economic status, +disability, ethnicity, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Contributing diff --git a/apis/extension/node.go b/apis/extension/node.go index 8a87fa267..be16c2247 100644 --- a/apis/extension/node.go +++ b/apis/extension/node.go @@ -20,6 +20,8 @@ import ( "encoding/json" "k8s.io/apimachinery/pkg/types" + + schedulingconfig "github.com/koordinator-sh/koordinator/apis/scheduling/config" ) const ( @@ -30,6 +32,22 @@ const ( // AnnotationNodeCPUSharedPools describes the CPU Shared Pool defined by Koordinator. // The shared pool is mainly used by Koordinator LS Pods or K8s Burstable Pods. AnnotationNodeCPUSharedPools = NodeDomainPrefix + "/cpu-shared-pools" + + // LabelNodeCPUBindPolicy constrains how to bind CPU logical CPUs when scheduling. + LabelNodeCPUBindPolicy = NodeDomainPrefix + "/cpu-bind-policy" + // LabelNodeNUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes when scheduling. + LabelNodeNUMAAllocateStrategy = NodeDomainPrefix + "/numa-allocate-strategy" +) + +const ( + // NodeCPUBindPolicyFullPCPUsOnly requires that the scheduler must allocate full physical cores. + // Equivalent to kubelet CPU manager policy option full-pcpus-only=true. 
+ NodeCPUBindPolicyFullPCPUsOnly = "FullPCPUsOnly" +) + +const ( + NodeNUMAAllocateStrategyLeastAllocated = string(schedulingconfig.NUMALeastAllocated) + NodeNUMAAllocateStrategyMostAllocated = string(schedulingconfig.NUMAMostAllocated) ) type CPUTopology struct { @@ -77,3 +95,16 @@ func GetPodCPUAllocs(annotations map[string]string) (PodCPUAllocs, error) { } return allocs, nil } + +func GetNodeCPUSharePools(nodeTopoAnnotations map[string]string) ([]CPUSharedPool, error) { + var cpuSharePools []CPUSharedPool + data, ok := nodeTopoAnnotations[AnnotationNodeCPUSharedPools] + if !ok { + return cpuSharePools, nil + } + err := json.Unmarshal([]byte(data), &cpuSharePools) + if err != nil { + return nil, err + } + return cpuSharePools, nil +} diff --git a/apis/extension/pod.go b/apis/extension/pod.go index bcc8cf97e..40f924949 100644 --- a/apis/extension/pod.go +++ b/apis/extension/pod.go @@ -27,7 +27,7 @@ import ( const ( AnnotationPodCPUBurst = DomainPrefix + "cpuBurst" - AnnotationPodMemoryQoS = DomainPrefix + "memoryQoS" + AnnotationPodMemoryQoS = DomainPrefix + "memoryQOS" ) func GetPodCPUBurstConfig(pod *corev1.Pod) (*slov1aplhpa1.CPUBurstConfig, error) { @@ -47,7 +47,7 @@ func GetPodCPUBurstConfig(pod *corev1.Pod) (*slov1aplhpa1.CPUBurstConfig, error) return &cpuBurst, nil } -func GetPodMemoryQoSConfig(pod *corev1.Pod) (*slov1aplhpa1.PodMemoryQoSConfig, error) { +func GetPodMemoryQoSConfig(pod *corev1.Pod) (*slov1aplhpa1.PodMemoryQOSConfig, error) { if pod == nil || pod.Annotations == nil { return nil, nil } @@ -55,7 +55,7 @@ func GetPodMemoryQoSConfig(pod *corev1.Pod) (*slov1aplhpa1.PodMemoryQoSConfig, e if !exist { return nil, nil } - cfg := slov1aplhpa1.PodMemoryQoSConfig{} + cfg := slov1aplhpa1.PodMemoryQOSConfig{} err := json.Unmarshal([]byte(value), &cfg) if err != nil { return nil, err diff --git a/apis/extension/resource.go b/apis/extension/resource.go index ec0ebad68..49328f35b 100644 --- a/apis/extension/resource.go +++ b/apis/extension/resource.go @@ -28,6 +28,12 @@ const ( BatchCPU corev1.ResourceName = DomainPrefix + "batch-cpu" BatchMemory corev1.ResourceName = DomainPrefix + "batch-memory" + GPUCore corev1.ResourceName = DomainPrefix + "gpu-core" + GPUMemory corev1.ResourceName = DomainPrefix + "gpu-memory" + GPUMemoryRatio corev1.ResourceName = DomainPrefix + "gpu-memory-ratio" +) + +const ( // AnnotationResourceSpec represents resource allocation API defined by Koordinator. // The user specifies the desired CPU orchestration policy by setting the annotation. AnnotationResourceSpec = SchedulingDomainPrefix + "/resource-spec" @@ -49,6 +55,8 @@ var ( type ResourceSpec struct { // PreferredCPUBindPolicy represents best-effort CPU bind policy. PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"` + // PreferredCPUExclusivePolicy represents best-effort CPU exclusive policy. + PreferredCPUExclusivePolicy CPUExclusivePolicy `json:"preferredCPUExclusivePolicy,omitempty"` } // ResourceStatus describes resource allocation result, such as how to bind CPU. 
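Reviewer note: a minimal sketch (not part of the patch) of how a consumer could read the node-level CPU shared pools with the `GetNodeCPUSharePools` helper added above in `apis/extension/node.go`. The `main` wrapper and the empty annotations map are illustrative assumptions; only the `extension` package names come from the diff.

```go
package main

import (
	"fmt"

	"github.com/koordinator-sh/koordinator/apis/extension"
)

// printCPUSharedPools parses the node topology annotation added by this patch
// and prints each shared pool it finds.
func printCPUSharedPools(nodeTopoAnnotations map[string]string) error {
	pools, err := extension.GetNodeCPUSharePools(nodeTopoAnnotations)
	if err != nil {
		return fmt.Errorf("parse %s: %w", extension.AnnotationNodeCPUSharedPools, err)
	}
	for i, pool := range pools {
		fmt.Printf("shared pool %d: %+v\n", i, pool)
	}
	return nil
}

func main() {
	// With no annotation set, the helper returns an empty slice and no error.
	_ = printCPUSharedPools(map[string]string{})
}
```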
@@ -64,8 +72,8 @@ type ResourceStatus struct { type CPUBindPolicy = schedulingconfig.CPUBindPolicy const ( - // CPUBindPolicyNone does not perform any bind policy - CPUBindPolicyNone CPUBindPolicy = schedulingconfig.CPUBindPolicyNone + // CPUBindPolicyDefault performs the default bind policy that specified in koord-scheduler configuration + CPUBindPolicyDefault CPUBindPolicy = schedulingconfig.CPUBindPolicyDefault // CPUBindPolicyFullPCPUs favor cpuset allocation that pack in few physical cores CPUBindPolicyFullPCPUs CPUBindPolicy = schedulingconfig.CPUBindPolicyFullPCPUs // CPUBindPolicySpreadByPCPUs favor cpuset allocation that evenly allocate logical cpus across physical cores @@ -74,6 +82,17 @@ const ( CPUBindPolicyConstrainedBurst CPUBindPolicy = schedulingconfig.CPUBindPolicyConstrainedBurst ) +type CPUExclusivePolicy = schedulingconfig.CPUExclusivePolicy + +const ( + // CPUExclusivePolicyNone does not perform any exclusive policy + CPUExclusivePolicyNone CPUExclusivePolicy = schedulingconfig.CPUExclusivePolicyNone + // CPUExclusivePolicyPCPULevel represents mutual exclusion in the physical core dimension + CPUExclusivePolicyPCPULevel CPUExclusivePolicy = schedulingconfig.CPUExclusivePolicyPCPULevel + // CPUExclusivePolicyNUMANodeLevel indicates mutual exclusion in the NUMA topology dimension + CPUExclusivePolicyNUMANodeLevel CPUExclusivePolicy = schedulingconfig.CPUExclusivePolicyNUMANodeLevel +) + type NUMACPUSharedPools []CPUSharedPool type CPUSharedPool struct { @@ -85,7 +104,7 @@ type CPUSharedPool struct { // GetResourceSpec parses ResourceSpec from annotations func GetResourceSpec(annotations map[string]string) (*ResourceSpec, error) { resourceSpec := &ResourceSpec{ - PreferredCPUBindPolicy: schedulingconfig.CPUBindPolicyNone, + PreferredCPUBindPolicy: schedulingconfig.CPUBindPolicyDefault, } data, ok := annotations[AnnotationResourceSpec] if !ok { diff --git a/apis/extension/scheduling.go b/apis/extension/scheduling.go index f73bc64a4..5eaa1d827 100644 --- a/apis/extension/scheduling.go +++ b/apis/extension/scheduling.go @@ -18,14 +18,45 @@ package extension import ( "encoding/json" + "time" corev1 "k8s.io/api/core/v1" + + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" ) const ( // AnnotationCustomUsageThresholds represents the user-defined resource utilization threshold. // For specific value definitions, see CustomUsageThresholds AnnotationCustomUsageThresholds = SchedulingDomainPrefix + "/usage-thresholds" + + // AnnotationReservationAllocated represents the reservation allocated by the pod. 
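Reviewer note: to make the behavioral change in `GetResourceSpec` concrete — the fallback is now `CPUBindPolicyDefault` instead of the removed `CPUBindPolicyNone` — here is a hedged sketch. Only the `extension` identifiers are taken from the patch; the surrounding `main` function is illustrative.

```go
package main

import (
	"fmt"

	"github.com/koordinator-sh/koordinator/apis/extension"
)

func main() {
	// A pod without the resource-spec annotation now parses to the Default
	// bind policy, which lets koord-scheduler apply its configured
	// DefaultCPUBindPolicy instead of skipping CPU binding entirely.
	spec, err := extension.GetResourceSpec(map[string]string{})
	if err != nil {
		panic(err)
	}
	fmt.Println(spec.PreferredCPUBindPolicy == extension.CPUBindPolicyDefault) // true
}
```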
+ AnnotationReservationAllocated = SchedulingDomainPrefix + "/reservation-allocated" +) + +type Status string + +//Gang scheduling consts +const ( + GangAnnotationPrefix = "gang.scheduling.koordinator.sh" + DefaultGangWaitTime = 2 * time.Minute + + StrictMode = "StrictMode" + NonStrictMode = "NonStrictMode" + + // Gang's Annotation + GangNameAnnotation = GangAnnotationPrefix + "/name" + GangMinNumAnnotation = GangAnnotationPrefix + "/min-available" + GangWaitTimeAnnotation = GangAnnotationPrefix + "/waiting-time" + GangTotalNumAnnotation = GangAnnotationPrefix + "/total-number" + GangModeAnnotation = GangAnnotationPrefix + "/gang-mode" + GangGroupsAnnotation = GangAnnotationPrefix + "/groups" + GangTimeOutAnnotation = GangAnnotationPrefix + "/timeout" + + //Permit internal status + GangNotFoundInCache Status = "Gang not found in cache" + Success Status = "Success" + Wait Status = "Wait" ) // CustomUsageThresholds supports user-defined node resource utilization thresholds. @@ -45,3 +76,36 @@ func GetCustomUsageThresholds(node *corev1.Node) (*CustomUsageThresholds, error) } return usageThresholds, nil } + +type ReservationAllocated struct { + Namespace string `json:"namespace,omitempty"` + Name string `json:"name,omitempty"` +} + +func GetReservationAllocated(pod *corev1.Pod) (*ReservationAllocated, error) { + if pod.Annotations == nil { + return nil, nil + } + data, ok := pod.Annotations[AnnotationReservationAllocated] + if !ok { + return nil, nil + } + reservationAllocated := &ReservationAllocated{} + err := json.Unmarshal([]byte(data), reservationAllocated) + if err != nil { + return nil, err + } + return reservationAllocated, nil +} + +func SetReservationAllocated(pod *corev1.Pod, r *schedulingv1alpha1.Reservation) { + if pod.Annotations == nil { + pod.Annotations = map[string]string{} + } + reservationAllocated := &ReservationAllocated{ + Namespace: r.Namespace, + Name: r.Name, + } + data, _ := json.Marshal(reservationAllocated) // assert no error + pod.Annotations[AnnotationReservationAllocated] = string(data) +} diff --git a/apis/scheduling/config/types.go b/apis/scheduling/config/types.go index f14d4c78d..4a83dbd88 100644 --- a/apis/scheduling/config/types.go +++ b/apis/scheduling/config/types.go @@ -73,17 +73,16 @@ type ScoringStrategy struct { type NodeNUMAResourceArgs struct { metav1.TypeMeta - PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"` - NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"` - ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"` + DefaultCPUBindPolicy CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"` + ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"` } // CPUBindPolicy defines the CPU binding policy type CPUBindPolicy string const ( - // CPUBindPolicyNone does not perform any bind policy - CPUBindPolicyNone CPUBindPolicy = "None" + // CPUBindPolicyDefault performs the default bind policy that specified in koord-scheduler configuration + CPUBindPolicyDefault CPUBindPolicy = "Default" // CPUBindPolicyFullPCPUs favor cpuset allocation that pack in few physical cores CPUBindPolicyFullPCPUs CPUBindPolicy = "FullPCPUs" // CPUBindPolicySpreadByPCPUs favor cpuset allocation that evenly allocate logical cpus across physical cores @@ -92,6 +91,17 @@ const ( CPUBindPolicyConstrainedBurst CPUBindPolicy = "ConstrainedBurst" ) +type CPUExclusivePolicy string + +const ( + // CPUExclusivePolicyNone does not perform any exclusive policy + CPUExclusivePolicyNone CPUExclusivePolicy 
= "None" + // CPUExclusivePolicyPCPULevel represents mutual exclusion in the physical core dimension + CPUExclusivePolicyPCPULevel CPUExclusivePolicy = "PCPULevel" + // CPUExclusivePolicyNUMANodeLevel indicates mutual exclusion in the NUMA topology dimension + CPUExclusivePolicyNUMANodeLevel CPUExclusivePolicy = "NUMANodeLevel" +) + // NUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes type NUMAAllocateStrategy string diff --git a/apis/scheduling/config/v1beta2/defaults.go b/apis/scheduling/config/v1beta2/defaults.go index 800f3e143..78f4c0eaa 100644 --- a/apis/scheduling/config/v1beta2/defaults.go +++ b/apis/scheduling/config/v1beta2/defaults.go @@ -41,7 +41,6 @@ var ( } defaultPreferredCPUBindPolicy = CPUBindPolicyFullPCPUs - defaultNUMAAllocateStrategy = NUMAMostAllocated defaultNodeNUMAResourceScoringStrategy = &ScoringStrategy{ Type: MostAllocated, Resources: []schedconfig.ResourceSpec{ @@ -74,11 +73,8 @@ func SetDefaults_LoadAwareSchedulingArgs(obj *LoadAwareSchedulingArgs) { // SetDefaults_NodeNUMAResourceArgs sets the default parameters for NodeNUMANodeResource plugin. func SetDefaults_NodeNUMAResourceArgs(obj *NodeNUMAResourceArgs) { - if obj.PreferredCPUBindPolicy == "" { - obj.PreferredCPUBindPolicy = defaultPreferredCPUBindPolicy - } - if obj.NUMAAllocateStrategy == "" { - obj.NUMAAllocateStrategy = defaultNUMAAllocateStrategy + if obj.DefaultCPUBindPolicy == "" { + obj.DefaultCPUBindPolicy = defaultPreferredCPUBindPolicy } if obj.ScoringStrategy == nil { obj.ScoringStrategy = defaultNodeNUMAResourceScoringStrategy diff --git a/apis/scheduling/config/v1beta2/types.go b/apis/scheduling/config/v1beta2/types.go index d7407e89b..b446a0a8a 100644 --- a/apis/scheduling/config/v1beta2/types.go +++ b/apis/scheduling/config/v1beta2/types.go @@ -71,11 +71,10 @@ type ScoringStrategy struct { // NodeNUMAResourceArgs holds arguments used to configure the NodeNUMAResource plugin. 
type NodeNUMAResourceArgs struct { - metav1.TypeMeta `json:",inline"` + metav1.TypeMeta - PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"` - NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"` - ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"` + DefaultCPUBindPolicy CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"` + ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"` } // CPUBindPolicy defines the CPU binding policy @@ -92,6 +91,17 @@ const ( CPUBindPolicyConstrainedBurst CPUBindPolicy = "ConstrainedBurst" ) +type CPUExclusivePolicy string + +const ( + // CPUExclusivePolicyNone does not perform any exclusive policy + CPUExclusivePolicyNone CPUExclusivePolicy = "None" + // CPUExclusivePolicyPCPULevel represents mutual exclusion in the physical core dimension + CPUExclusivePolicyPCPULevel CPUExclusivePolicy = "PCPULevel" + // CPUExclusivePolicyNUMANodeLevel indicates mutual exclusion in the NUMA topology dimension + CPUExclusivePolicyNUMANodeLevel CPUExclusivePolicy = "NUMANodeLevel" +) + // NUMAAllocateStrategy indicates how to choose satisfied NUMA Nodes type NUMAAllocateStrategy string diff --git a/apis/scheduling/config/v1beta2/zz_generated.conversion.go b/apis/scheduling/config/v1beta2/zz_generated.conversion.go index 24342863f..02f495529 100644 --- a/apis/scheduling/config/v1beta2/zz_generated.conversion.go +++ b/apis/scheduling/config/v1beta2/zz_generated.conversion.go @@ -100,8 +100,7 @@ func Convert_config_LoadAwareSchedulingArgs_To_v1beta2_LoadAwareSchedulingArgs(i } func autoConvert_v1beta2_NodeNUMAResourceArgs_To_config_NodeNUMAResourceArgs(in *NodeNUMAResourceArgs, out *config.NodeNUMAResourceArgs, s conversion.Scope) error { - out.PreferredCPUBindPolicy = config.CPUBindPolicy(in.PreferredCPUBindPolicy) - out.NUMAAllocateStrategy = config.NUMAAllocateStrategy(in.NUMAAllocateStrategy) + out.DefaultCPUBindPolicy = config.CPUBindPolicy(in.DefaultCPUBindPolicy) out.ScoringStrategy = (*config.ScoringStrategy)(unsafe.Pointer(in.ScoringStrategy)) return nil } @@ -112,8 +111,7 @@ func Convert_v1beta2_NodeNUMAResourceArgs_To_config_NodeNUMAResourceArgs(in *Nod } func autoConvert_config_NodeNUMAResourceArgs_To_v1beta2_NodeNUMAResourceArgs(in *config.NodeNUMAResourceArgs, out *NodeNUMAResourceArgs, s conversion.Scope) error { - out.PreferredCPUBindPolicy = CPUBindPolicy(in.PreferredCPUBindPolicy) - out.NUMAAllocateStrategy = NUMAAllocateStrategy(in.NUMAAllocateStrategy) + out.DefaultCPUBindPolicy = CPUBindPolicy(in.DefaultCPUBindPolicy) out.ScoringStrategy = (*ScoringStrategy)(unsafe.Pointer(in.ScoringStrategy)) return nil } diff --git a/apis/scheduling/v1alpha1/device_types.go b/apis/scheduling/v1alpha1/device_types.go new file mode 100644 index 000000000..6a2793971 --- /dev/null +++ b/apis/scheduling/v1alpha1/device_types.go @@ -0,0 +1,89 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type DeviceType string + +const ( + GPU DeviceType = "gpu" + FPGA DeviceType = "fpga" + RDMA DeviceType = "rdma" +) + +type DeviceSpec struct { + Devices []DeviceInfo `json:"devices"` +} + +type DeviceInfo struct { + // UUID represents the UUID of device + UUID string `json:"id,omitempty"` + // Minor represents the Minor number of Device, starting from 0 + Minor int32 `json:"minor,omitempty"` + // Type represents the type of device + Type DeviceType `json:"type,omitempty"` + // Health indicates whether the device is normal + Health bool `json:"health,omitempty"` + // Resources is a set of (resource name, quantity) pairs + Resources corev1.ResourceList `json:"resources,omitempty"` +} + +type DeviceStatus struct { + Allocations []DeviceAllocation `json:"allocations"` +} + +type DeviceAllocation struct { + Type DeviceType `json:"type"` + Entries []DeviceAllocationItem `json:"entries"` +} + +type DeviceAllocationItem struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + UUID string `json:"uuid"` + Devices []string `json:"devices"` +} + +// +genclient +// +genclient:nonNamespaced +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Cluster + +type Device struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec DeviceSpec `json:"spec,omitempty"` + Status DeviceStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +type DeviceList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + + Items []Device `json:"items"` +} + +func init() { + SchemeBuilder.Register(&Device{}, &DeviceList{}) +} diff --git a/apis/scheduling/v1alpha1/pod_migration_job_types.go b/apis/scheduling/v1alpha1/pod_migration_job_types.go new file mode 100644 index 000000000..9f7fced76 --- /dev/null +++ b/apis/scheduling/v1alpha1/pod_migration_job_types.go @@ -0,0 +1,206 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! +// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. + +type PodMigrationJobSpec struct { + // Paused indicates whether the PodMigrationJob should to work or not. + // Default is false + // +optional + Paused bool `json:"paused,omitempty"` + + // TTL controls the PodMigrationJob timeout duration. 
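Reviewer note: a hedged sketch of how the new cluster-scoped `Device` object might be populated for a single GPU. The object name, UUID, and resource quantities below are invented for illustration; the `GPUCore`/`GPUMemory`/`GPUMemoryRatio` resource names come from `apis/extension/resource.go` in this patch.

```go
package main

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/koordinator-sh/koordinator/apis/extension"
	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

// sampleDevice builds a Device describing one healthy GPU on the given node.
func sampleDevice(name string) *schedulingv1alpha1.Device {
	return &schedulingv1alpha1.Device{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec: schedulingv1alpha1.DeviceSpec{
			Devices: []schedulingv1alpha1.DeviceInfo{
				{
					UUID:   "GPU-00000000-0000-0000-0000-000000000000",
					Minor:  0,
					Type:   schedulingv1alpha1.GPU,
					Health: true,
					Resources: corev1.ResourceList{
						extension.GPUCore:        resource.MustParse("100"),
						extension.GPUMemoryRatio: resource.MustParse("100"),
						extension.GPUMemory:      resource.MustParse("16Gi"),
					},
				},
			},
		},
	}
}

func main() {
	_ = sampleDevice("node-0")
}
```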
+ // +optional + TTL *metav1.Duration `json:"ttl,omitempty"` + + // Mode represents the operating mode of the Job + // Default is PodMigrationJobModeReservationFirst + // +optional + Mode PodMigrationJobMode `json:"mode,omitempty"` + + // PodRef represents the Pod that be migrated + // +required + PodRef *corev1.ObjectReference `json:"podRef"` + + // ReservationOptions defines the Reservation options for migrated Pod + // +optional + ReservationOptions *PodMigrateReservationOptions `json:"reservationOptions,omitempty"` + + // DeleteOptions defines the deleting options for the migrated Pod and preempted Pods + // +optional + DeleteOptions *metav1.DeleteOptions `json:"deleteOptions,omitempty"` +} + +type PodMigrationJobMode string + +const ( + PodMigrationJobModeReservationFirst PodMigrationJobMode = "ReservationFirst" + PodMigrationJobModeEvictionDirectly PodMigrationJobMode = "EvictDirectly" +) + +type PodMigrateReservationOptions struct { + // ReservationRef if specified, PodMigrationJob will check if the status of Reservation is available. + // ReservationRef if not specified, PodMigrationJob controller will create Reservation by Template, + // and update the ReservationRef to reference the Reservation + // +optional + ReservationRef *corev1.ObjectReference `json:"reservationRef,omitempty"` + + // Template is the object that describes the Reservation that will be created if not specified ReservationRef + // +optional + Template *ReservationTemplateSpec `json:"template,omitempty"` + + // PreemptionOption decides whether to preempt other Pods. + // The preemption is safe and reserves resources for preempted Pods. + // +optional + PreemptionOptions *PodMigrationJobPreemptionOptions `json:"preemptionOptions,omitempty"` +} + +type PodMigrationJobPreemptionOptions struct { + // Reserved object. +} + +type PodMigrationJobStatus struct { + // PodMigrationJobPhase represents the phase of a PodMigrationJob is a simple, high-level summary of where the PodMigrationJob is in its lifecycle. + // e.g. Pending/Running/Failed + Phase PodMigrationJobPhase `json:"phase,omitempty"` + // Status represents the current status of PodMigrationJob + // e.g. ReservationCreated + Status string `json:"state,omitempty"` + // Reason represents a brief CamelCase message indicating details about why the PodMigrationJob is in this state. + Reason string `json:"reason,omitempty"` + // Message represents a human-readable message indicating details about why the PodMigrationJob is in this state. 
+ Message string `json:"message,omitempty"` + // Conditions records the stats of PodMigrationJob + Conditions []PodMigrationJobCondition `json:"conditions,omitempty"` + // NodeName represents the node's name of migrated Pod + NodeName string `json:"nodeName,omitempty"` + // PodsRef represents the newly created Pods after being migrated + PodsRef []corev1.ObjectReference `json:"podsRef,omitempty"` + // PreemptedPodsRef represents the Pods that be preempted + PreemptedPodsRef []corev1.ObjectReference `json:"preemptedPodsRef,omitempty"` + // PreemptedPodsReservations records information about Reservations created due to preemption + PreemptedPodsReservations []PodMigrationJobPreemptedReservation `json:"preemptedPodsReservation,omitempty"` +} + +type PodMigrationJobPreemptedReservation struct { + // Namespace represents the namespace of Reservation + Namespace string `json:"namespace,omitempty"` + // Name represents the name of Reservation + Name string `json:"name,omitempty"` + // NodeName represents the assigned node for Reservation by scheduler + NodeName string `json:"nodeName,omitempty"` + // Phase represents the Phase of Reservation + Phase string `json:"phase,omitempty"` + // PreemptedPodRef represents the Pod that be preempted + PreemptedPodRef *corev1.ObjectReference `json:"preemptedPodRef,omitempty"` + // PodsRef represents the newly created Pods after being preempted + PodsRef []corev1.ObjectReference `json:"podsRef,omitempty"` +} + +type PodMigrationJobCondition struct { + // Type is the type of the condition. + Type PodMigrationJobConditionType `json:"type"` + // Status is the status of the condition. + // Can be True, False, Unknown. + Status PodMigrationJobConditionStatus `json:"status"` + // Last time we probed the condition. + // +nullable + LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"` + // Last time the condition transitioned from one status to another. + // +nullable + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` + // Unique, one-word, CamelCase reason for the condition's last transition. + Reason string `json:"reason,omitempty"` + // Human-readable message indicating details about last transition. + Message string `json:"message,omitempty"` +} + +type PodMigrationJobPhase string + +const ( + // PodMigrationJobPending represents the initial status + PodMigrationJobPending PodMigrationJobPhase = "Pending" + // PodMigrationJobRunning represents the PodMigrationJob is being processed + PodMigrationJobRunning PodMigrationJobPhase = "Running" + // PodMigrationJobSucceed represents the PodMigrationJob processed successfully + PodMigrationJobSucceed PodMigrationJobPhase = "Succeed" + // PodMigrationJobFailed represents the PodMigrationJob process failed caused by Timeout, Reservation failed, etc. + PodMigrationJobFailed PodMigrationJobPhase = "Failed" + // PodMigrationJobAborted represents the user forcefully aborted the PodMigrationJob. + PodMigrationJobAborted PodMigrationJobPhase = "Aborted" +) + +type PodMigrationJobConditionType string + +// These are valid conditions of PodMigrationJob. 
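Reviewer note: a sketch (not from the patch) of how the `PodMigrationJobSpec` introduced in this file might be filled in. The pod reference and the ten-minute TTL are example values only.

```go
package main

import (
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

// sampleMigrationSpec asks the controller to migrate one pod, reserving
// resources first and giving up after ten minutes.
func sampleMigrationSpec() schedulingv1alpha1.PodMigrationJobSpec {
	return schedulingv1alpha1.PodMigrationJobSpec{
		Mode: schedulingv1alpha1.PodMigrationJobModeReservationFirst,
		TTL:  &metav1.Duration{Duration: 10 * time.Minute},
		PodRef: &corev1.ObjectReference{
			Namespace: "default",
			Name:      "nginx-0",
		},
	}
}

func main() {
	_ = sampleMigrationSpec()
}
```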
+const ( + PodMigrationJobConditionReservationCreated PodMigrationJobConditionType = "ReservationCreated" + PodMigrationJobConditionReservationScheduled PodMigrationJobConditionType = "ReservationScheduled" + PodMigrationJobConditionWaitForConfirmPreempt PodMigrationJobConditionType = "WaitForConfirmPreempt" + PodMigrationJobConditionPreempting PodMigrationJobConditionType = "Preempting" + PodMigrationJobConditionEvicting PodMigrationJobConditionType = "Evicting" + PodMigrationJobConditionReservationWaitForBind PodMigrationJobConditionType = "WaitForBind" +) + +type PodMigrationJobConditionStatus string + +const ( + PodMigrationJobConditionStatusTrue PodMigrationJobConditionStatus = "True" + PodMigrationJobConditionStatusFalse PodMigrationJobConditionStatus = "False" + PodMigrationJobConditionStatusUnknown PodMigrationJobConditionStatus = "Unknown" +) + +// PodMigrationJob is the Schema for the PodMigrationJob API +// +k8s:openapi-gen=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +genclient +// +genclient:nonNamespaced +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The phase of PodMigrationJob" +// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:printcolumn:name="Node",type="string",JSONPath=".status.nodeName" +// +kubebuilder:printcolumn:name="TTL",type="string",JSONPath=".spec.ttl" + +type PodMigrationJob struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec PodMigrationJobSpec `json:"spec,omitempty"` + Status PodMigrationJobStatus `json:"status,omitempty"` +} + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// PodMigrationJobList contains a list of PodMigrationJob +type PodMigrationJobList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []PodMigrationJob `json:"items"` +} + +func init() { + SchemeBuilder.Register(&PodMigrationJob{}, &PodMigrationJobList{}) +} diff --git a/apis/scheduling/v1alpha1/reservation_types.go b/apis/scheduling/v1alpha1/reservation_types.go index a8cf072cc..1e9f8ae57 100644 --- a/apis/scheduling/v1alpha1/reservation_types.go +++ b/apis/scheduling/v1alpha1/reservation_types.go @@ -32,41 +32,77 @@ type ReservationSpec struct { // like a normal pod. // If the `template.spec.nodeName` is specified, the scheduler will not choose another node but reserve resources on // the specified node. + // +optional Template *corev1.PodTemplateSpec `json:"template,omitempty"` // Specify the owners who can allocate the reserved resources. - // Multiple owner selectors and ANDed. + // Multiple owner selectors and ORed. + // +optional Owners []ReservationOwner `json:"owners,omitempty"` // By default, the resources requirements of reservation (specified in `template.spec`) is filtered by whether the // node has sufficient free resources (i.e. ReservationRequest < NodeFree). // When `preAllocation` is set, the scheduler will skip this validation and allow overcommitment. The scheduled // reservation would be waiting to be available until free resources are sufficient. + // +optional PreAllocation bool `json:"preAllocation,omitempty"` // Time-to-Live period for the reservation. - // `expires` and `ttl` are mutually exclusive. Defaults to 24h. + // `expires` and `ttl` are mutually exclusive. Defaults to 24h. 
Set 0 to disable expiration. + // +kubebuilder:default='24h' + // +optional TTL *metav1.Duration `json:"ttl,omitempty"` - // Expired timestamp when the reservation expires. + // Expired timestamp when the reservation is expected to expire. + // If both `expires` and `ttl` are set, `expires` is checked first. // `expires` and `ttl` are mutually exclusive. Defaults to being set dynamically at runtime based on the `ttl`. + // +optional Expires *metav1.Time `json:"expires,omitempty"` } +// ReservationTemplateSpec describes the data a Reservation should have when created from a template +type ReservationTemplateSpec struct { + // Standard object's metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired behavior of the Reservation. + // +optional + Spec ReservationSpec `json:"spec,omitempty"` +} + type ReservationStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file // The `phase` indicates whether is reservation is waiting for process (`Pending`), available to allocate // (`Available`) or expired to get cleanup (Expired). + // +optional Phase ReservationPhase `json:"phase,omitempty"` + // The `expired` indicates the timestamp if the reservation is expired. + // +optional + Expired *metav1.Time `json:"expired,omitempty"` // The `conditions` indicate the messages of reason why the reservation is still pending. + // +optional Conditions []ReservationCondition `json:"conditions,omitempty"` // Current resource owners which allocated the reservation resources. + // +optional CurrentOwners []corev1.ObjectReference `json:"currentOwners,omitempty"` + // Name of node the reservation is scheduled on. + // +optional + NodeName string `json:"nodeName,omitempty"` + // Resource reserved and allocatable for owners. + // +optional + Allocatable corev1.ResourceList `json:"allocatable,omitempty"` + // Resource allocated by current owners. + // +optional + Allocated corev1.ResourceList `json:"allocated,omitempty"` } type ReservationOwner struct { - // Multiple field selectors are ORed. - Object *corev1.ObjectReference `json:"object,omitempty"` - Controller *ReservationControllerReference `json:"controller,omitempty"` - LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` + // Multiple field selectors are ANDed. 
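Reviewer note: to make the owner-matching semantics above concrete — multiple owners are ORed against each other, while the selectors inside a single owner are ANDed — here is an illustrative `ReservationSpec`. The label values and the 24h TTL are example choices, not code from the patch.

```go
package main

import (
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

// sampleReservationSpec reserves resources that only pods matching the label
// selector below may consume; the reservation expires after 24 hours.
func sampleReservationSpec() schedulingv1alpha1.ReservationSpec {
	return schedulingv1alpha1.ReservationSpec{
		TTL: &metav1.Duration{Duration: 24 * time.Hour},
		Owners: []schedulingv1alpha1.ReservationOwner{
			{
				// Fields inside one owner are ANDed; listing several owners
				// would OR them together.
				LabelSelector: &metav1.LabelSelector{
					MatchLabels: map[string]string{"app": "nginx"},
				},
			},
		},
	}
}

func main() {
	_ = sampleReservationSpec()
}
```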
+ // +optional + Object *corev1.ObjectReference `json:"object,omitempty"` + // +optional + Controller *ReservationControllerReference `json:"controller,omitempty"` + // +optional + LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` } type ReservationControllerReference struct { @@ -91,6 +127,11 @@ const ( ReservationExpired ReservationPhase = "Expired" ) +const ( + ReasonReservationAvailable = "Available" + ReasonReservationExpired = "Expired" +) + type ReservationCondition struct { LastProbeTime metav1.Time `json:"lastProbeTime"` LastTransitionTime metav1.Time `json:"lastTransitionTime"` @@ -99,7 +140,6 @@ type ReservationCondition struct { } // +genclient -// +genclient:nonNamespaced // +kubebuilder:resource:scope=Cluster // +kubebuilder:object:root=true // +kubebuilder:subresource:status @@ -109,12 +149,11 @@ type ReservationCondition struct { // +kubebuilder:printcolumn:name="TTL",type="string",JSONPath=".spec.ttl",priority=10 // +kubebuilder:printcolumn:name="Expires",type="string",JSONPath=".spec.expires",priority=10 -// Reservation is the Schema for the reservation API +// Reservation is the Schema for the reservation API. +// A Reservation object is namespaced. But it can reserve resources for pods of any namespace. Any +// namespaced affinity/anti-affinity of reservation scheduling can be specified with the ObjectMeta. type Reservation struct { - metav1.TypeMeta `json:",inline"` - // A Reservation object is non-namespaced. - // It can reserve resources for pods of any namespace. Any affinity/anti-affinity of reservation scheduling can be - // specified in the pod template. + metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` Spec ReservationSpec `json:"spec,omitempty"` diff --git a/apis/scheduling/v1alpha1/zz_generated.deepcopy.go b/apis/scheduling/v1alpha1/zz_generated.deepcopy.go index 13e9e06ca..60eeaab0c 100644 --- a/apis/scheduling/v1alpha1/zz_generated.deepcopy.go +++ b/apis/scheduling/v1alpha1/zz_generated.deepcopy.go @@ -27,6 +27,393 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Device) DeepCopyInto(out *Device) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Device. +func (in *Device) DeepCopy() *Device { + if in == nil { + return nil + } + out := new(Device) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Device) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeviceAllocation) DeepCopyInto(out *DeviceAllocation) { + *out = *in + if in.Entries != nil { + in, out := &in.Entries, &out.Entries + *out = make([]DeviceAllocationItem, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeviceAllocation. 
+func (in *DeviceAllocation) DeepCopy() *DeviceAllocation { + if in == nil { + return nil + } + out := new(DeviceAllocation) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeviceAllocationItem) DeepCopyInto(out *DeviceAllocationItem) { + *out = *in + if in.Devices != nil { + in, out := &in.Devices, &out.Devices + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeviceAllocationItem. +func (in *DeviceAllocationItem) DeepCopy() *DeviceAllocationItem { + if in == nil { + return nil + } + out := new(DeviceAllocationItem) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeviceInfo) DeepCopyInto(out *DeviceInfo) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = make(v1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeviceInfo. +func (in *DeviceInfo) DeepCopy() *DeviceInfo { + if in == nil { + return nil + } + out := new(DeviceInfo) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeviceList) DeepCopyInto(out *DeviceList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Device, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeviceList. +func (in *DeviceList) DeepCopy() *DeviceList { + if in == nil { + return nil + } + out := new(DeviceList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DeviceList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeviceSpec) DeepCopyInto(out *DeviceSpec) { + *out = *in + if in.Devices != nil { + in, out := &in.Devices, &out.Devices + *out = make([]DeviceInfo, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeviceSpec. +func (in *DeviceSpec) DeepCopy() *DeviceSpec { + if in == nil { + return nil + } + out := new(DeviceSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeviceStatus) DeepCopyInto(out *DeviceStatus) { + *out = *in + if in.Allocations != nil { + in, out := &in.Allocations, &out.Allocations + *out = make([]DeviceAllocation, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeviceStatus. 
+func (in *DeviceStatus) DeepCopy() *DeviceStatus { + if in == nil { + return nil + } + out := new(DeviceStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrateReservationOptions) DeepCopyInto(out *PodMigrateReservationOptions) { + *out = *in + if in.ReservationRef != nil { + in, out := &in.ReservationRef, &out.ReservationRef + *out = new(v1.ObjectReference) + **out = **in + } + if in.Template != nil { + in, out := &in.Template, &out.Template + *out = new(ReservationTemplateSpec) + (*in).DeepCopyInto(*out) + } + if in.PreemptionOptions != nil { + in, out := &in.PreemptionOptions, &out.PreemptionOptions + *out = new(PodMigrationJobPreemptionOptions) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrateReservationOptions. +func (in *PodMigrateReservationOptions) DeepCopy() *PodMigrateReservationOptions { + if in == nil { + return nil + } + out := new(PodMigrateReservationOptions) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrationJob) DeepCopyInto(out *PodMigrationJob) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJob. +func (in *PodMigrationJob) DeepCopy() *PodMigrationJob { + if in == nil { + return nil + } + out := new(PodMigrationJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PodMigrationJob) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrationJobCondition) DeepCopyInto(out *PodMigrationJobCondition) { + *out = *in + in.LastProbeTime.DeepCopyInto(&out.LastProbeTime) + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJobCondition. +func (in *PodMigrationJobCondition) DeepCopy() *PodMigrationJobCondition { + if in == nil { + return nil + } + out := new(PodMigrationJobCondition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrationJobList) DeepCopyInto(out *PodMigrationJobList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]PodMigrationJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJobList. +func (in *PodMigrationJobList) DeepCopy() *PodMigrationJobList { + if in == nil { + return nil + } + out := new(PodMigrationJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *PodMigrationJobList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrationJobPreemptedReservation) DeepCopyInto(out *PodMigrationJobPreemptedReservation) { + *out = *in + if in.PreemptedPodRef != nil { + in, out := &in.PreemptedPodRef, &out.PreemptedPodRef + *out = new(v1.ObjectReference) + **out = **in + } + if in.PodsRef != nil { + in, out := &in.PodsRef, &out.PodsRef + *out = make([]v1.ObjectReference, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJobPreemptedReservation. +func (in *PodMigrationJobPreemptedReservation) DeepCopy() *PodMigrationJobPreemptedReservation { + if in == nil { + return nil + } + out := new(PodMigrationJobPreemptedReservation) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrationJobPreemptionOptions) DeepCopyInto(out *PodMigrationJobPreemptionOptions) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJobPreemptionOptions. +func (in *PodMigrationJobPreemptionOptions) DeepCopy() *PodMigrationJobPreemptionOptions { + if in == nil { + return nil + } + out := new(PodMigrationJobPreemptionOptions) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodMigrationJobSpec) DeepCopyInto(out *PodMigrationJobSpec) { + *out = *in + if in.TTL != nil { + in, out := &in.TTL, &out.TTL + *out = new(metav1.Duration) + **out = **in + } + if in.PodRef != nil { + in, out := &in.PodRef, &out.PodRef + *out = new(v1.ObjectReference) + **out = **in + } + if in.ReservationOptions != nil { + in, out := &in.ReservationOptions, &out.ReservationOptions + *out = new(PodMigrateReservationOptions) + (*in).DeepCopyInto(*out) + } + if in.DeleteOptions != nil { + in, out := &in.DeleteOptions, &out.DeleteOptions + *out = new(metav1.DeleteOptions) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJobSpec. +func (in *PodMigrationJobSpec) DeepCopy() *PodMigrationJobSpec { + if in == nil { + return nil + } + out := new(PodMigrationJobSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PodMigrationJobStatus) DeepCopyInto(out *PodMigrationJobStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]PodMigrationJobCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.PodsRef != nil { + in, out := &in.PodsRef, &out.PodsRef + *out = make([]v1.ObjectReference, len(*in)) + copy(*out, *in) + } + if in.PreemptedPodsRef != nil { + in, out := &in.PreemptedPodsRef, &out.PreemptedPodsRef + *out = make([]v1.ObjectReference, len(*in)) + copy(*out, *in) + } + if in.PreemptedPodsReservations != nil { + in, out := &in.PreemptedPodsReservations, &out.PreemptedPodsReservations + *out = make([]PodMigrationJobPreemptedReservation, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMigrationJobStatus. +func (in *PodMigrationJobStatus) DeepCopy() *PodMigrationJobStatus { + if in == nil { + return nil + } + out := new(PodMigrationJobStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Reservation) DeepCopyInto(out *Reservation) { *out = *in @@ -188,6 +575,10 @@ func (in *ReservationSpec) DeepCopy() *ReservationSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ReservationStatus) DeepCopyInto(out *ReservationStatus) { *out = *in + if in.Expired != nil { + in, out := &in.Expired, &out.Expired + *out = (*in).DeepCopy() + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]ReservationCondition, len(*in)) @@ -200,6 +591,20 @@ func (in *ReservationStatus) DeepCopyInto(out *ReservationStatus) { *out = make([]v1.ObjectReference, len(*in)) copy(*out, *in) } + if in.Allocatable != nil { + in, out := &in.Allocatable, &out.Allocatable + *out = make(v1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.Allocated != nil { + in, out := &in.Allocated, &out.Allocated + *out = make(v1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReservationStatus. @@ -211,3 +616,20 @@ func (in *ReservationStatus) DeepCopy() *ReservationStatus { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ReservationTemplateSpec) DeepCopyInto(out *ReservationTemplateSpec) { + *out = *in + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReservationTemplateSpec. +func (in *ReservationTemplateSpec) DeepCopy() *ReservationTemplateSpec { + if in == nil { + return nil + } + out := new(ReservationTemplateSpec) + in.DeepCopyInto(out) + return out +} diff --git a/apis/slo/v1alpha1/nodemetric_types.go b/apis/slo/v1alpha1/nodemetric_types.go index 190f08564..0448be0e5 100644 --- a/apis/slo/v1alpha1/nodemetric_types.go +++ b/apis/slo/v1alpha1/nodemetric_types.go @@ -17,7 +17,6 @@ limitations under the License. 
package v1alpha1 import ( - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -34,10 +33,6 @@ type PodMetricInfo struct { PodUsage ResourceMap `json:"podUsage,omitempty"` } -type ResourceMap struct { - corev1.ResourceList `json:"resources,omitempty"` -} - // NodeMetricSpec defines the desired state of NodeMetric type NodeMetricSpec struct { // CollectPolicy defines the Metric collection policy diff --git a/apis/slo/v1alpha1/nodeslo_types.go b/apis/slo/v1alpha1/nodeslo_types.go index 20ad00f05..511b92def 100644 --- a/apis/slo/v1alpha1/nodeslo_types.go +++ b/apis/slo/v1alpha1/nodeslo_types.go @@ -24,13 +24,13 @@ import ( // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. // CPUQOS enables cpu qos features. -type CPUQoS struct { +type CPUQOS struct { // group identity value for pods, default = 0 GroupIdentity *int64 `json:"groupIdentity,omitempty"` } -// MemoryQoS enables memory qos features. -type MemoryQoS struct { +// MemoryQOS enables memory qos features. +type MemoryQOS struct { // memcg qos // If enabled, memcg qos will be set by the agent, where some fields are implicitly calculated from pod spec. // 1. `memory.min` := spec.requests.memory * minLimitFactor / 100 (use 0 if requests.memory is not set) @@ -92,60 +92,60 @@ type MemoryQoS struct { OomKillGroup *int64 `json:"oomKillGroup,omitempty"` } -type PodMemoryQoSPolicy string +type PodMemoryQOSPolicy string const ( - // PodMemoryQoSPolicyDefault indicates pod inherits node-level config - PodMemoryQoSPolicyDefault PodMemoryQoSPolicy = "default" - // PodMemoryQoSPolicyNone indicates pod disables memory qos - PodMemoryQoSPolicyNone PodMemoryQoSPolicy = "none" - // PodMemoryQoSPolicyAuto indicates pod uses a recommended config - PodMemoryQoSPolicyAuto PodMemoryQoSPolicy = "auto" + // PodMemoryQOSPolicyDefault indicates pod inherits node-level config + PodMemoryQOSPolicyDefault PodMemoryQOSPolicy = "default" + // PodMemoryQOSPolicyNone indicates pod disables memory qos + PodMemoryQOSPolicyNone PodMemoryQOSPolicy = "none" + // PodMemoryQOSPolicyAuto indicates pod uses a recommended config + PodMemoryQOSPolicyAuto PodMemoryQOSPolicy = "auto" ) -type PodMemoryQoSConfig struct { +type PodMemoryQOSConfig struct { // Policy indicates the qos plan; use "default" if empty - Policy PodMemoryQoSPolicy `json:"policy,omitempty"` - MemoryQoS `json:",inline"` + Policy PodMemoryQOSPolicy `json:"policy,omitempty"` + MemoryQOS `json:",inline"` } -// CPUQoSCfg stores node-level config of cpu qos -type CPUQoSCfg struct { +// CPUQOSCfg stores node-level config of cpu qos +type CPUQOSCfg struct { // Enable indicates whether the cpu qos is enabled. Enable *bool `json:"enable,omitempty"` - CPUQoS `json:",inline"` + CPUQOS `json:",inline"` } -// MemoryQoSCfg stores node-level config of memory qos -type MemoryQoSCfg struct { +// MemoryQOSCfg stores node-level config of memory qos +type MemoryQOSCfg struct { // Enable indicates whether the memory qos is enabled (default: false). - // This field is used for node-level control, while pod-level configuration is done with MemoryQoS and `Policy` - // instead of an `Enable` option. Please view the differences between MemoryQoSCfg and PodMemoryQoSConfig structs. + // This field is used for node-level control, while pod-level configuration is done with MemoryQOS and `Policy` + // instead of an `Enable` option. Please view the differences between MemoryQOSCfg and PodMemoryQOSConfig structs. 
Enable *bool `json:"enable,omitempty"` - MemoryQoS `json:",inline"` + MemoryQOS `json:",inline"` } -type ResourceQoS struct { - CPUQoS *CPUQoSCfg `json:"cpuQoS,omitempty"` - MemoryQoS *MemoryQoSCfg `json:"memoryQoS,omitempty"` - ResctrlQoS *ResctrlQoSCfg `json:"resctrlQoS,omitempty"` +type ResourceQOS struct { + CPUQOS *CPUQOSCfg `json:"cpuQOS,omitempty"` + MemoryQOS *MemoryQOSCfg `json:"memoryQOS,omitempty"` + ResctrlQOS *ResctrlQOSCfg `json:"resctrlQOS,omitempty"` } -type ResourceQoSStrategy struct { - // ResourceQoS for LSR pods. - LSR *ResourceQoS `json:"lsr,omitempty"` +type ResourceQOSStrategy struct { + // ResourceQOS for LSR pods. + LSRClass *ResourceQOS `json:"lsrClass,omitempty"` - // ResourceQoS for LS pods. - LS *ResourceQoS `json:"ls,omitempty"` + // ResourceQOS for LS pods. + LSClass *ResourceQOS `json:"lsClass,omitempty"` - // ResourceQoS for BE pods. - BE *ResourceQoS `json:"be,omitempty"` + // ResourceQOS for BE pods. + BEClass *ResourceQOS `json:"beClass,omitempty"` - // ResourceQoS for system pods - System *ResourceQoS `json:"system,omitempty"` + // ResourceQOS for system pods + SystemClass *ResourceQOS `json:"systemClass,omitempty"` - // ResourceQoS for root cgroup. - CgroupRoot *ResourceQoS `json:"cgroupRoot,omitempty"` + // ResourceQOS for root cgroup. + CgroupRoot *ResourceQOS `json:"cgroupRoot,omitempty"` } type CPUSuppressPolicy string @@ -186,14 +186,14 @@ type ResourceThresholdStrategy struct { CPUEvictTimeWindowSeconds *int64 `json:"cpuEvictTimeWindowSeconds,omitempty"` } -// ResctrlQoSCfg stores node-level config of resctrl qos -type ResctrlQoSCfg struct { +// ResctrlQOSCfg stores node-level config of resctrl qos +type ResctrlQOSCfg struct { // Enable indicates whether the resctrl qos is enabled. Enable *bool `json:"enable,omitempty"` - ResctrlQoS `json:",inline"` + ResctrlQOS `json:",inline"` } -type ResctrlQoS struct { +type ResctrlQOS struct { // LLC available range start for pods by percentage // +kubebuilder:default=0 // +kubebuilder:validation:Minimum=0 @@ -251,7 +251,7 @@ type NodeSLOSpec struct { // BE pods will be limited if node resource usage overload ResourceUsedThresholdWithBE *ResourceThresholdStrategy `json:"resourceUsedThresholdWithBE,omitempty"` // QoS config strategy for pods of different qos-class - ResourceQoSStrategy *ResourceQoSStrategy `json:"resourceQoSStrategy,omitempty"` + ResourceQOSStrategy *ResourceQOSStrategy `json:"resourceQOSStrategy,omitempty"` // CPU Burst Strategy CPUBurstStrategy *CPUBurstStrategy `json:"cpuBurstStrategy,omitempty"` } diff --git a/apis/slo/v1alpha1/resources.go b/apis/slo/v1alpha1/resources.go new file mode 100644 index 000000000..8e84a8750 --- /dev/null +++ b/apis/slo/v1alpha1/resources.go @@ -0,0 +1,28 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" +) + +type ResourceMap struct { + corev1.ResourceList `json:"resources,omitempty"` + Devices []schedulingv1alpha1.DeviceInfo `json:"devices,omitempty"` +} diff --git a/apis/slo/v1alpha1/zz_generated.deepcopy.go b/apis/slo/v1alpha1/zz_generated.deepcopy.go index 8e71c9f49..5d57933d7 100644 --- a/apis/slo/v1alpha1/zz_generated.deepcopy.go +++ b/apis/slo/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ limitations under the License. package v1alpha1 import ( + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" "k8s.io/api/core/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -78,7 +79,7 @@ func (in *CPUBurstStrategy) DeepCopy() *CPUBurstStrategy { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *CPUQoS) DeepCopyInto(out *CPUQoS) { +func (in *CPUQOS) DeepCopyInto(out *CPUQOS) { *out = *in if in.GroupIdentity != nil { in, out := &in.GroupIdentity, &out.GroupIdentity @@ -87,39 +88,39 @@ func (in *CPUQoS) DeepCopyInto(out *CPUQoS) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUQoS. -func (in *CPUQoS) DeepCopy() *CPUQoS { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUQOS. +func (in *CPUQOS) DeepCopy() *CPUQOS { if in == nil { return nil } - out := new(CPUQoS) + out := new(CPUQOS) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *CPUQoSCfg) DeepCopyInto(out *CPUQoSCfg) { +func (in *CPUQOSCfg) DeepCopyInto(out *CPUQOSCfg) { *out = *in if in.Enable != nil { in, out := &in.Enable, &out.Enable *out = new(bool) **out = **in } - in.CPUQoS.DeepCopyInto(&out.CPUQoS) + in.CPUQOS.DeepCopyInto(&out.CPUQOS) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUQoSCfg. -func (in *CPUQoSCfg) DeepCopy() *CPUQoSCfg { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUQOSCfg. +func (in *CPUQOSCfg) DeepCopy() *CPUQOSCfg { if in == nil { return nil } - out := new(CPUQoSCfg) + out := new(CPUQOSCfg) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MemoryQoS) DeepCopyInto(out *MemoryQoS) { +func (in *MemoryQOS) DeepCopyInto(out *MemoryQOS) { *out = *in if in.MinLimitPercent != nil { in, out := &in.MinLimitPercent, &out.MinLimitPercent @@ -168,33 +169,33 @@ func (in *MemoryQoS) DeepCopyInto(out *MemoryQoS) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemoryQoS. -func (in *MemoryQoS) DeepCopy() *MemoryQoS { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemoryQOS. +func (in *MemoryQOS) DeepCopy() *MemoryQOS { if in == nil { return nil } - out := new(MemoryQoS) + out := new(MemoryQOS) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *MemoryQoSCfg) DeepCopyInto(out *MemoryQoSCfg) { +func (in *MemoryQOSCfg) DeepCopyInto(out *MemoryQOSCfg) { *out = *in if in.Enable != nil { in, out := &in.Enable, &out.Enable *out = new(bool) **out = **in } - in.MemoryQoS.DeepCopyInto(&out.MemoryQoS) + in.MemoryQOS.DeepCopyInto(&out.MemoryQOS) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemoryQoSCfg. -func (in *MemoryQoSCfg) DeepCopy() *MemoryQoSCfg { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemoryQOSCfg. +func (in *MemoryQOSCfg) DeepCopy() *MemoryQOSCfg { if in == nil { return nil } - out := new(MemoryQoSCfg) + out := new(MemoryQOSCfg) in.DeepCopyInto(out) return out } @@ -421,9 +422,9 @@ func (in *NodeSLOSpec) DeepCopyInto(out *NodeSLOSpec) { *out = new(ResourceThresholdStrategy) (*in).DeepCopyInto(*out) } - if in.ResourceQoSStrategy != nil { - in, out := &in.ResourceQoSStrategy, &out.ResourceQoSStrategy - *out = new(ResourceQoSStrategy) + if in.ResourceQOSStrategy != nil { + in, out := &in.ResourceQOSStrategy, &out.ResourceQOSStrategy + *out = new(ResourceQOSStrategy) (*in).DeepCopyInto(*out) } if in.CPUBurstStrategy != nil { @@ -459,17 +460,17 @@ func (in *NodeSLOStatus) DeepCopy() *NodeSLOStatus { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PodMemoryQoSConfig) DeepCopyInto(out *PodMemoryQoSConfig) { +func (in *PodMemoryQOSConfig) DeepCopyInto(out *PodMemoryQOSConfig) { *out = *in - in.MemoryQoS.DeepCopyInto(&out.MemoryQoS) + in.MemoryQOS.DeepCopyInto(&out.MemoryQOS) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMemoryQoSConfig. -func (in *PodMemoryQoSConfig) DeepCopy() *PodMemoryQoSConfig { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodMemoryQOSConfig. +func (in *PodMemoryQOSConfig) DeepCopy() *PodMemoryQOSConfig { if in == nil { return nil } - out := new(PodMemoryQoSConfig) + out := new(PodMemoryQOSConfig) in.DeepCopyInto(out) return out } @@ -491,7 +492,7 @@ func (in *PodMetricInfo) DeepCopy() *PodMetricInfo { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ResctrlQoS) DeepCopyInto(out *ResctrlQoS) { +func (in *ResctrlQOS) DeepCopyInto(out *ResctrlQOS) { *out = *in if in.CATRangeStartPercent != nil { in, out := &in.CATRangeStartPercent, &out.CATRangeStartPercent @@ -510,33 +511,33 @@ func (in *ResctrlQoS) DeepCopyInto(out *ResctrlQoS) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResctrlQoS. -func (in *ResctrlQoS) DeepCopy() *ResctrlQoS { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResctrlQOS. +func (in *ResctrlQOS) DeepCopy() *ResctrlQOS { if in == nil { return nil } - out := new(ResctrlQoS) + out := new(ResctrlQOS) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ResctrlQoSCfg) DeepCopyInto(out *ResctrlQoSCfg) { +func (in *ResctrlQOSCfg) DeepCopyInto(out *ResctrlQOSCfg) { *out = *in if in.Enable != nil { in, out := &in.Enable, &out.Enable *out = new(bool) **out = **in } - in.ResctrlQoS.DeepCopyInto(&out.ResctrlQoS) + in.ResctrlQOS.DeepCopyInto(&out.ResctrlQOS) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResctrlQoSCfg. 
-func (in *ResctrlQoSCfg) DeepCopy() *ResctrlQoSCfg { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResctrlQOSCfg. +func (in *ResctrlQOSCfg) DeepCopy() *ResctrlQOSCfg { if in == nil { return nil } - out := new(ResctrlQoSCfg) + out := new(ResctrlQOSCfg) in.DeepCopyInto(out) return out } @@ -551,6 +552,13 @@ func (in *ResourceMap) DeepCopyInto(out *ResourceMap) { (*out)[key] = val.DeepCopy() } } + if in.Devices != nil { + in, out := &in.Devices, &out.Devices + *out = make([]schedulingv1alpha1.DeviceInfo, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceMap. @@ -564,71 +572,71 @@ func (in *ResourceMap) DeepCopy() *ResourceMap { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ResourceQoS) DeepCopyInto(out *ResourceQoS) { +func (in *ResourceQOS) DeepCopyInto(out *ResourceQOS) { *out = *in - if in.CPUQoS != nil { - in, out := &in.CPUQoS, &out.CPUQoS - *out = new(CPUQoSCfg) + if in.CPUQOS != nil { + in, out := &in.CPUQOS, &out.CPUQOS + *out = new(CPUQOSCfg) (*in).DeepCopyInto(*out) } - if in.MemoryQoS != nil { - in, out := &in.MemoryQoS, &out.MemoryQoS - *out = new(MemoryQoSCfg) + if in.MemoryQOS != nil { + in, out := &in.MemoryQOS, &out.MemoryQOS + *out = new(MemoryQOSCfg) (*in).DeepCopyInto(*out) } - if in.ResctrlQoS != nil { - in, out := &in.ResctrlQoS, &out.ResctrlQoS - *out = new(ResctrlQoSCfg) + if in.ResctrlQOS != nil { + in, out := &in.ResctrlQOS, &out.ResctrlQOS + *out = new(ResctrlQOSCfg) (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQoS. -func (in *ResourceQoS) DeepCopy() *ResourceQoS { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQOS. +func (in *ResourceQOS) DeepCopy() *ResourceQOS { if in == nil { return nil } - out := new(ResourceQoS) + out := new(ResourceQOS) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ResourceQoSStrategy) DeepCopyInto(out *ResourceQoSStrategy) { +func (in *ResourceQOSStrategy) DeepCopyInto(out *ResourceQOSStrategy) { *out = *in - if in.LSR != nil { - in, out := &in.LSR, &out.LSR - *out = new(ResourceQoS) + if in.LSRClass != nil { + in, out := &in.LSRClass, &out.LSRClass + *out = new(ResourceQOS) (*in).DeepCopyInto(*out) } - if in.LS != nil { - in, out := &in.LS, &out.LS - *out = new(ResourceQoS) + if in.LSClass != nil { + in, out := &in.LSClass, &out.LSClass + *out = new(ResourceQOS) (*in).DeepCopyInto(*out) } - if in.BE != nil { - in, out := &in.BE, &out.BE - *out = new(ResourceQoS) + if in.BEClass != nil { + in, out := &in.BEClass, &out.BEClass + *out = new(ResourceQOS) (*in).DeepCopyInto(*out) } - if in.System != nil { - in, out := &in.System, &out.System - *out = new(ResourceQoS) + if in.SystemClass != nil { + in, out := &in.SystemClass, &out.SystemClass + *out = new(ResourceQOS) (*in).DeepCopyInto(*out) } if in.CgroupRoot != nil { in, out := &in.CgroupRoot, &out.CgroupRoot - *out = new(ResourceQoS) + *out = new(ResourceQOS) (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQoSStrategy. 
-func (in *ResourceQoSStrategy) DeepCopy() *ResourceQoSStrategy {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQOSStrategy.
+func (in *ResourceQOSStrategy) DeepCopy() *ResourceQOSStrategy {
 	if in == nil {
 		return nil
 	}
-	out := new(ResourceQoSStrategy)
+	out := new(ResourceQOSStrategy)
 	in.DeepCopyInto(out)
 	return out
 }
diff --git a/cmd/koord-scheduler/app/server.go b/cmd/koord-scheduler/app/server.go
index 81279d6e6..6cf22927c 100644
--- a/cmd/koord-scheduler/app/server.go
+++ b/cmd/koord-scheduler/app/server.go
@@ -65,7 +65,7 @@ import (
 type Option func(frameworkext.ExtendedHandle, runtime.Registry) error
 
 // NewSchedulerCommand creates a *cobra.Command object with default parameters and registryOptions
-func NewSchedulerCommand(registryOptions ...Option) *cobra.Command {
+func NewSchedulerCommand(schedulingHooks []frameworkext.SchedulingPhaseHook, registryOptions ...Option) *cobra.Command {
 	opts := options.NewOptions()
 
 	cmd := &cobra.Command{
@@ -78,7 +78,7 @@ scenarios,ensuring the runtime quality of different workloads and users' demands
 for cost reduction and efficiency enhancement.
 `,
 		Run: func(cmd *cobra.Command, args []string) {
-			if err := runCommand(cmd, opts, registryOptions...); err != nil {
+			if err := runCommand(cmd, opts, schedulingHooks, registryOptions...); err != nil {
 				fmt.Fprintf(os.Stderr, "%v\n", err)
 				os.Exit(1)
 			}
@@ -110,7 +110,7 @@ for cost reduction and efficiency enhancement.
 }
 
 // runCommand runs the scheduler.
-func runCommand(cmd *cobra.Command, opts *options.Options, registryOptions ...Option) error {
+func runCommand(cmd *cobra.Command, opts *options.Options, schedulingHooks []frameworkext.SchedulingPhaseHook, registryOptions ...Option) error {
 	verflag.PrintAndExitIfRequested()
 	cliflag.PrintFlags(cmd.Flags())
 
@@ -122,7 +122,7 @@ func runCommand(cmd *cobra.Command, opts *options.Options, registryOptions ...Op
 		cancel()
 	}()
 
-	cc, sched, err := Setup(ctx, opts, registryOptions...)
+	cc, sched, err := Setup(ctx, opts, schedulingHooks, registryOptions...)
 	if err != nil {
 		return err
 	}
@@ -307,7 +307,7 @@ func WithPlugin(name string, factory runtime.PluginFactory) Option {
 }
 
 // Setup creates a completed config and a scheduler based on the command args and options
-func Setup(ctx context.Context, opts *options.Options, outOfTreeRegistryOptions ...Option) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) {
+func Setup(ctx context.Context, opts *options.Options, schedulingHooks []frameworkext.SchedulingPhaseHook, outOfTreeRegistryOptions ...Option) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) {
 	if cfg, err := latest.Default(); err != nil {
 		return nil, nil, err
 	} else {
@@ -373,5 +373,12 @@ func Setup(ctx context.Context, opts *options.Options, outOfTreeRegistryOptions
 	// TODO(joseph): Some extensions can also be made in the future,
 	// such as replacing some interfaces in Scheduler to implement custom logic
 
+	// extend the framework to hook the plugin run functions
+	extendedFrameworkFactory := frameworkext.NewFrameworkExtenderFactory(extendedHandle, schedulingHooks...)
+	for k, v := range sched.Profiles {
+		sched.Profiles[k] = extendedFrameworkFactory.New(v)
+	}
+	// TODO: register event handlers for scheduler instance
+
 	return &cc, sched, nil
 }
diff --git a/cmd/koord-scheduler/main.go b/cmd/koord-scheduler/main.go
index 590b3baeb..6fa82ff0d 100644
--- a/cmd/koord-scheduler/main.go
+++ b/cmd/koord-scheduler/main.go
@@ -24,6 +24,7 @@ import (
 	"k8s.io/component-base/logs"
 
 	"github.com/koordinator-sh/koordinator/cmd/koord-scheduler/app"
+	"github.com/koordinator-sh/koordinator/pkg/scheduler/frameworkext"
 	"github.com/koordinator-sh/koordinator/pkg/scheduler/plugins/compatibledefaultpreemption"
 	"github.com/koordinator-sh/koordinator/pkg/scheduler/plugins/loadaware"
 	"github.com/koordinator-sh/koordinator/pkg/scheduler/plugins/nodenumaresource"
@@ -35,10 +36,15 @@ import (
 func main() {
 	rand.Seed(time.Now().UnixNano())
 
+	// Register custom scheduling hooks to pre-process the scheduling context before plugins are called,
+	// e.g. modify a copy of the nodeInfo before the filter plugins run.
+	var schedulingHooks []frameworkext.SchedulingPhaseHook
+
 	// Register custom plugins to the scheduler framework.
 	// Later they can consist of scheduler profile(s) and hence
 	// used by various kinds of workloads.
 	command := app.NewSchedulerCommand(
+		schedulingHooks,
 		app.WithPlugin(loadaware.Name, loadaware.New),
 		app.WithPlugin(nodenumaresource.Name, nodenumaresource.New),
 		app.WithPlugin(compatibledefaultpreemption.Name, compatibledefaultpreemption.New),
diff --git a/config/crd/bases/scheduling.koordinator.sh_devices.yaml b/config/crd/bases/scheduling.koordinator.sh_devices.yaml
new file mode 100644
index 000000000..e1b3d3672
--- /dev/null
+++ b/config/crd/bases/scheduling.koordinator.sh_devices.yaml
@@ -0,0 +1,111 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.8.0
+  creationTimestamp: null
+  name: devices.scheduling.koordinator.sh
+spec:
+  group: scheduling.koordinator.sh
+  names:
+    kind: Device
+    listKind: DeviceList
+    plural: devices
+    singular: device
+  scope: Cluster
+  versions:
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        properties:
+          apiVersion:
+            description: 'APIVersion defines the versioned schema of this representation
+              of an object. Servers should convert recognized schemas to the latest
+              internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + devices: + items: + properties: + health: + description: Health indicates whether the device is normal + type: boolean + id: + description: UUID represents the UUID of device + type: string + minor: + description: Minor represents the Minor number of Device, starting + from 0 + format: int32 + type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resources is a set of (resource name, quantity) + pairs + type: object + type: + description: Type represents the type of device + type: string + type: object + type: array + required: + - devices + type: object + status: + properties: + allocations: + items: + properties: + entries: + items: + properties: + devices: + items: + type: string + type: array + name: + type: string + namespace: + type: string + uuid: + type: string + required: + - devices + - name + - namespace + - uuid + type: object + type: array + type: + type: string + required: + - entries + - type + type: object + type: array + required: + - allocations + type: object + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/config/crd/bases/scheduling.koordinator.sh_podmigrationjobs.yaml b/config/crd/bases/scheduling.koordinator.sh_podmigrationjobs.yaml new file mode 100644 index 000000000..4ca5d3d24 --- /dev/null +++ b/config/crd/bases/scheduling.koordinator.sh_podmigrationjobs.yaml @@ -0,0 +1,8709 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.8.0 + creationTimestamp: null + name: podmigrationjobs.scheduling.koordinator.sh +spec: + group: scheduling.koordinator.sh + names: + kind: PodMigrationJob + listKind: PodMigrationJobList + plural: podmigrationjobs + singular: podmigrationjob + scope: Cluster + versions: + - additionalPrinterColumns: + - description: The phase of PodMigrationJob + jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .status.nodeName + name: Node + type: string + - jsonPath: .spec.ttl + name: TTL + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + deleteOptions: + description: DeleteOptions defines the deleting options for the migrated + Pod and preempted Pods + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + dryRun: + description: 'When present, indicates that modifications should + not be persisted. An invalid or unrecognized dryRun directive + will result in an error response and no further processing of + the request. Valid values are: - All: all dry run stages will + be processed' + items: + type: string + type: array + gracePeriodSeconds: + description: The duration in seconds before the object should + be deleted. Value must be non-negative integer. The value zero + indicates delete immediately. If this value is nil, the default + grace period for the specified type will be used. Defaults to + a per object value if not specified. zero means delete immediately. + format: int64 + type: integer + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + orphanDependents: + description: 'Deprecated: please use the PropagationPolicy, this + field will be deprecated in 1.7. Should the dependent objects + be orphaned. If true/false, the "orphan" finalizer will be added + to/removed from the object''s finalizers list. Either this field + or PropagationPolicy may be set, but not both.' + type: boolean + preconditions: + description: Must be fulfilled before a deletion is carried out. + If not possible, a 409 Conflict status will be returned. + properties: + resourceVersion: + description: Specifies the target ResourceVersion + type: string + uid: + description: Specifies the target UID. + type: string + type: object + propagationPolicy: + description: 'Whether and how garbage collection will be performed. + Either this field or OrphanDependents may be set, but not both. + The default policy is decided by the existing finalizer set + in the metadata.finalizers and the resource-specific default + policy. Acceptable values are: ''Orphan'' - orphan the dependents; + ''Background'' - allow the garbage collector to delete the dependents + in the background; ''Foreground'' - a cascading policy that + deletes all dependents in the foreground.' + type: string + type: object + mode: + description: Mode represents the operating mode of the Job Default + is PodMigrationJobModeReservationFirst + type: string + paused: + description: Paused indicates whether the PodMigrationJob should to + work or not. Default is false + type: boolean + podRef: + description: PodRef represents the Pod that be migrated + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an object instead of + an entire object, this string should contain a valid JSON/Go + field access statement, such as desiredState.manifest.containers[2]. 
+ For example, if the object reference is to a container within + a pod, this would take on a value like: "spec.containers{name}" + (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" + (container with index 2 in this pod). This syntax is chosen + only to have some well-defined way of referencing a part of + an object. TODO: this design is not final and this field is + subject to change in the future.' + type: string + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which this reference + is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + reservationOptions: + description: ReservationOptions defines the Reservation options for + migrated Pod + properties: + preemptionOptions: + description: PreemptionOption decides whether to preempt other + Pods. The preemption is safe and reserves resources for preempted + Pods. + type: object + reservationRef: + description: ReservationRef if specified, PodMigrationJob will + check if the status of Reservation is available. ReservationRef + if not specified, PodMigrationJob controller will create Reservation + by Template, and update the ReservationRef to reference the + Reservation + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an object instead + of an entire object, this string should contain a valid + JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within + a pod, this would take on a value like: "spec.containers{name}" + (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" + (container with index 2 in this pod). This syntax is chosen + only to have some well-defined way of referencing a part + of an object. TODO: this design is not final and this field + is subject to change in the future.' + type: string + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which this reference + is made, if any. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + template: + description: Template is the object that describes the Reservation + that will be created if not specified ReservationRef + properties: + metadata: + description: Standard object's metadata. + type: object + spec: + description: Specification of the desired behavior of the + Reservation. + properties: + expires: + description: Expired timestamp when the reservation is + expected to expire. If both `expires` and `ttl` are + set, `expires` is checked first. `expires` and `ttl` + are mutually exclusive. Defaults to being set dynamically + at runtime based on the `ttl`. + format: date-time + type: string + owners: + description: Specify the owners who can allocate the reserved + resources. Multiple owner selectors and ORed. + items: + properties: + controller: + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the + "foregroundDeletion" finalizer, then the owner + cannot be deleted from the key-value store + until this reference is removed. Defaults + to false. To set this field, a user needs + "delete" permission of the owner, otherwise + 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points + to the managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: + https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: + http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + type: string + uid: + description: 'UID of the referent. More info: + http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: object + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels + and matchExpressions are ANDed. An empty label + selector matches all objects. A null label selector + matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement + is a selector that contains values, a key, + and an operator that relates the key and + values. + properties: + key: + description: key is the label key that + the selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. + If the operator is Exists or DoesNotExist, + the values array must be empty. This + array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} + pairs. 
A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is + "In", and the values array contains only "value". + The requirements are ANDed. + type: object + type: object + object: + description: Multiple field selectors are ANDed. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an + object instead of an entire object, this string + should contain a valid JSON/Go field access + statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to + a container within a pod, this would take + on a value like: "spec.containers{name}" (where + "name" refers to the name of the container + that triggered the event) or if no container + name is specified "spec.containers[2]" (container + with index 2 in this pod). This syntax is + chosen only to have some well-defined way + of referencing a part of an object. TODO: + this design is not final and this field is + subject to change in the future.' + type: string + kind: + description: 'Kind of the referent. More info: + https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which + this reference is made, if any. More info: + https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + type: object + type: array + preAllocation: + description: By default, the resources requirements of + reservation (specified in `template.spec`) is filtered + by whether the node has sufficient free resources (i.e. + ReservationRequest < NodeFree). When `preAllocation` + is set, the scheduler will skip this validation and + allow overcommitment. The scheduled reservation would + be waiting to be available until free resources are + sufficient. + type: boolean + template: + description: Template defines the scheduling requirements + (resources, affinities, images, ...) processed by the + scheduler just like a normal pod. If the `template.spec.nodeName` + is specified, the scheduler will not choose another + node but reserve resources on the specified node. + properties: + metadata: + description: 'Standard object''s metadata. More info: + https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + type: object + spec: + description: 'Specification of the desired behavior + of the pod. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status' + properties: + activeDeadlineSeconds: + description: Optional duration in seconds the + pod may be active on the node relative to StartTime + before the system will actively try to mark + it failed and kill associated containers. Value + must be a positive integer. 
+ format: int64 + type: integer + affinity: + description: If specified, the pod's scheduling + constraints + properties: + nodeAffinity: + description: Describes node affinity scheduling + rules for the pod. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer + to schedule pods to nodes that satisfy + the affinity expressions specified by + this field, but it may choose a node + that violates one or more of the expressions. + The node that is most preferred is the + one with the greatest sum of weights, + i.e. for each node that meets all of + the scheduling requirements (resource + request, requiredDuringScheduling affinity + expressions, etc.), compute a sum by + iterating through the elements of this + field and adding "weight" to the sum + if the node matches the corresponding + matchExpressions; the node(s) with the + highest sum are the most preferred. + items: + description: An empty preferred scheduling + term matches all objects with implicit + weight 0 (i.e. it's a no-op). A null + preferred scheduling term matches + no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, + associated with the corresponding + weight. + properties: + matchExpressions: + description: A list of node + selector requirements by node's + labels. + items: + description: A node selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: The label + key that the selector + applies to. + type: string + operator: + description: Represents + a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists, DoesNotExist. + Gt, and Lt. + type: string + values: + description: An array + of string values. If + the operator is In or + NotIn, the values array + must be non-empty. If + the operator is Exists + or DoesNotExist, the + values array must be + empty. If the operator + is Gt or Lt, the values + array must have a single + element, which will + be interpreted as an + integer. This array + is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchFields: + description: A list of node + selector requirements by node's + fields. + items: + description: A node selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: The label + key that the selector + applies to. + type: string + operator: + description: Represents + a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists, DoesNotExist. + Gt, and Lt. + type: string + values: + description: An array + of string values. If + the operator is In or + NotIn, the values array + must be non-empty. If + the operator is Exists + or DoesNotExist, the + values array must be + empty. If the operator + is Gt or Lt, the values + array must have a single + element, which will + be interpreted as an + integer. This array + is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + type: object + weight: + description: Weight associated with + matching the corresponding nodeSelectorTerm, + in the range 1-100. 
+ format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements + specified by this field are not met + at scheduling time, the pod will not + be scheduled onto the node. If the affinity + requirements specified by this field + cease to be met at some point during + pod execution (e.g. due to an update), + the system may or may not try to eventually + evict the pod from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node + selector terms. The terms are ORed. + items: + description: A null or empty node + selector term matches no objects. + The requirements of them are ANDed. + The TopologySelectorTerm type + implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node + selector requirements by node's + labels. + items: + description: A node selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: The label + key that the selector + applies to. + type: string + operator: + description: Represents + a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists, DoesNotExist. + Gt, and Lt. + type: string + values: + description: An array + of string values. If + the operator is In or + NotIn, the values array + must be non-empty. If + the operator is Exists + or DoesNotExist, the + values array must be + empty. If the operator + is Gt or Lt, the values + array must have a single + element, which will + be interpreted as an + integer. This array + is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchFields: + description: A list of node + selector requirements by node's + fields. + items: + description: A node selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: The label + key that the selector + applies to. + type: string + operator: + description: Represents + a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists, DoesNotExist. + Gt, and Lt. + type: string + values: + description: An array + of string values. If + the operator is In or + NotIn, the values array + must be non-empty. If + the operator is Exists + or DoesNotExist, the + values array must be + empty. If the operator + is Gt or Lt, the values + array must have a single + element, which will + be interpreted as an + integer. This array + is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + type: object + type: array + required: + - nodeSelectorTerms + type: object + type: object + podAffinity: + description: Describes pod affinity scheduling + rules (e.g. co-locate this pod in the same + node, zone, etc. as some other pod(s)). + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer + to schedule pods to nodes that satisfy + the affinity expressions specified by + this field, but it may choose a node + that violates one or more of the expressions. + The node that is most preferred is the + one with the greatest sum of weights, + i.e. 
for each node that meets all of + the scheduling requirements (resource + request, requiredDuringScheduling affinity + expressions, etc.), compute a sum by + iterating through the elements of this + field and adding "weight" to the sum + if the node has pods which matches the + corresponding podAffinityTerm; the node(s) + with the highest sum are the most preferred. + items: + description: The weights of all of the + matched WeightedPodAffinityTerm fields + are added per-node to find the most + preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity + term, associated with the corresponding + weight. + properties: + labelSelector: + description: A label query over + a set of resources, in this + case pods. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, + a key, and an operator + that relates the key + and values. + properties: + key: + description: key is + the label key that + the selector applies + to. + type: string + operator: + description: operator + represents a key's + relationship to + a set of values. + Valid operators + are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values + is an array of string + values. If the operator + is In or NotIn, + the values array + must be non-empty. + If the operator + is Exists or DoesNotExist, + the values array + must be empty. This + array is replaced + during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels + is a map of {key,value} + pairs. A single {key,value} + in the matchLabels map + is equivalent to an element + of matchExpressions, whose + key field is "key", the + operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over + the set of namespaces that + the term applies to. The term + is applied to the union of + the namespaces selected by + this field and the ones listed + in the namespaces field. null + selector and null or empty + namespaces list means "this + pod's namespace". An empty + selector ({}) matches all + namespaces. This field is + beta-level and is only honored + when PodAffinityNamespaceSelector + feature is enabled. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, + a key, and an operator + that relates the key + and values. + properties: + key: + description: key is + the label key that + the selector applies + to. + type: string + operator: + description: operator + represents a key's + relationship to + a set of values. + Valid operators + are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values + is an array of string + values. If the operator + is In or NotIn, + the values array + must be non-empty. + If the operator + is Exists or DoesNotExist, + the values array + must be empty. This + array is replaced + during a strategic + merge patch. 
+ items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels + is a map of {key,value} + pairs. A single {key,value} + in the matchLabels map + is equivalent to an element + of matchExpressions, whose + key field is "key", the + operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies + a static list of namespace + names that the term applies + to. The term is applied to + the union of the namespaces + listed in this field and the + ones selected by namespaceSelector. + null or empty namespaces list + and null namespaceSelector + means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should + be co-located (affinity) or + not co-located (anti-affinity) + with the pods matching the + labelSelector in the specified + namespaces, where co-located + is defined as running on a + node whose value of the label + with key topologyKey matches + that of any node on which + any of the selected pods is + running. Empty topologyKey + is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: weight associated with + matching the corresponding podAffinityTerm, + in the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements + specified by this field are not met + at scheduling time, the pod will not + be scheduled onto the node. If the affinity + requirements specified by this field + cease to be met at some point during + pod execution (e.g. due to a pod label + update), the system may or may not try + to eventually evict the pod from its + node. When there are multiple elements, + the lists of nodes corresponding to + each podAffinityTerm are intersected, + i.e. all terms must be satisfied. + items: + description: Defines a set of pods (namely + those matching the labelSelector relative + to the given namespace(s)) that this + pod should be co-located (affinity) + or not co-located (anti-affinity) + with, where co-located is defined + as running on a node whose value of + the label with key matches + that of any node on which a pod of + the set of pods is running + properties: + labelSelector: + description: A label query over + a set of resources, in this case + pods. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: operator + represents a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is + an array of string values. + If the operator is In + or NotIn, the values + array must be non-empty. + If the operator is Exists + or DoesNotExist, the + values array must be + empty. This array is + replaced during a strategic + merge patch. 
+ items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is + a map of {key,value} pairs. + A single {key,value} in the + matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", + the operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over + the set of namespaces that the + term applies to. The term is applied + to the union of the namespaces + selected by this field and the + ones listed in the namespaces + field. null selector and null + or empty namespaces list means + "this pod's namespace". An empty + selector ({}) matches all namespaces. + This field is beta-level and is + only honored when PodAffinityNamespaceSelector + feature is enabled. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: operator + represents a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is + an array of string values. + If the operator is In + or NotIn, the values + array must be non-empty. + If the operator is Exists + or DoesNotExist, the + values array must be + empty. This array is + replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is + a map of {key,value} pairs. + A single {key,value} in the + matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", + the operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies + a static list of namespace names + that the term applies to. The + term is applied to the union of + the namespaces listed in this + field and the ones selected by + namespaceSelector. null or empty + namespaces list and null namespaceSelector + means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be + co-located (affinity) or not co-located + (anti-affinity) with the pods + matching the labelSelector in + the specified namespaces, where + co-located is defined as running + on a node whose value of the label + with key topologyKey matches that + of any node on which any of the + selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + type: object + podAntiAffinity: + description: Describes pod anti-affinity scheduling + rules (e.g. avoid putting this pod in the + same node, zone, etc. as some other pod(s)). 
+ properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer + to schedule pods to nodes that satisfy + the anti-affinity expressions specified + by this field, but it may choose a node + that violates one or more of the expressions. + The node that is most preferred is the + one with the greatest sum of weights, + i.e. for each node that meets all of + the scheduling requirements (resource + request, requiredDuringScheduling anti-affinity + expressions, etc.), compute a sum by + iterating through the elements of this + field and adding "weight" to the sum + if the node has pods which matches the + corresponding podAffinityTerm; the node(s) + with the highest sum are the most preferred. + items: + description: The weights of all of the + matched WeightedPodAffinityTerm fields + are added per-node to find the most + preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity + term, associated with the corresponding + weight. + properties: + labelSelector: + description: A label query over + a set of resources, in this + case pods. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, + a key, and an operator + that relates the key + and values. + properties: + key: + description: key is + the label key that + the selector applies + to. + type: string + operator: + description: operator + represents a key's + relationship to + a set of values. + Valid operators + are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values + is an array of string + values. If the operator + is In or NotIn, + the values array + must be non-empty. + If the operator + is Exists or DoesNotExist, + the values array + must be empty. This + array is replaced + during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels + is a map of {key,value} + pairs. A single {key,value} + in the matchLabels map + is equivalent to an element + of matchExpressions, whose + key field is "key", the + operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over + the set of namespaces that + the term applies to. The term + is applied to the union of + the namespaces selected by + this field and the ones listed + in the namespaces field. null + selector and null or empty + namespaces list means "this + pod's namespace". An empty + selector ({}) matches all + namespaces. This field is + beta-level and is only honored + when PodAffinityNamespaceSelector + feature is enabled. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, + a key, and an operator + that relates the key + and values. + properties: + key: + description: key is + the label key that + the selector applies + to. + type: string + operator: + description: operator + represents a key's + relationship to + a set of values. + Valid operators + are In, NotIn, Exists + and DoesNotExist. 
+ type: string + values: + description: values + is an array of string + values. If the operator + is In or NotIn, + the values array + must be non-empty. + If the operator + is Exists or DoesNotExist, + the values array + must be empty. This + array is replaced + during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels + is a map of {key,value} + pairs. A single {key,value} + in the matchLabels map + is equivalent to an element + of matchExpressions, whose + key field is "key", the + operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies + a static list of namespace + names that the term applies + to. The term is applied to + the union of the namespaces + listed in this field and the + ones selected by namespaceSelector. + null or empty namespaces list + and null namespaceSelector + means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should + be co-located (affinity) or + not co-located (anti-affinity) + with the pods matching the + labelSelector in the specified + namespaces, where co-located + is defined as running on a + node whose value of the label + with key topologyKey matches + that of any node on which + any of the selected pods is + running. Empty topologyKey + is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: weight associated with + matching the corresponding podAffinityTerm, + in the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements + specified by this field are not met + at scheduling time, the pod will not + be scheduled onto the node. If the anti-affinity + requirements specified by this field + cease to be met at some point during + pod execution (e.g. due to a pod label + update), the system may or may not try + to eventually evict the pod from its + node. When there are multiple elements, + the lists of nodes corresponding to + each podAffinityTerm are intersected, + i.e. all terms must be satisfied. + items: + description: Defines a set of pods (namely + those matching the labelSelector relative + to the given namespace(s)) that this + pod should be co-located (affinity) + or not co-located (anti-affinity) + with, where co-located is defined + as running on a node whose value of + the label with key matches + that of any node on which a pod of + the set of pods is running + properties: + labelSelector: + description: A label query over + a set of resources, in this case + pods. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: operator + represents a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is + an array of string values. 
+ If the operator is In + or NotIn, the values + array must be non-empty. + If the operator is Exists + or DoesNotExist, the + values array must be + empty. This array is + replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is + a map of {key,value} pairs. + A single {key,value} in the + matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", + the operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over + the set of namespaces that the + term applies to. The term is applied + to the union of the namespaces + selected by this field and the + ones listed in the namespaces + field. null selector and null + or empty namespaces list means + "this pod's namespace". An empty + selector ({}) matches all namespaces. + This field is beta-level and is + only honored when PodAffinityNamespaceSelector + feature is enabled. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, a + key, and an operator that + relates the key and values. + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: operator + represents a key's relationship + to a set of values. + Valid operators are + In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is + an array of string values. + If the operator is In + or NotIn, the values + array must be non-empty. + If the operator is Exists + or DoesNotExist, the + values array must be + empty. This array is + replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is + a map of {key,value} pairs. + A single {key,value} in the + matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", + the operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies + a static list of namespace names + that the term applies to. The + term is applied to the union of + the namespaces listed in this + field and the ones selected by + namespaceSelector. null or empty + namespaces list and null namespaceSelector + means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be + co-located (affinity) or not co-located + (anti-affinity) with the pods + matching the labelSelector in + the specified namespaces, where + co-located is defined as running + on a node whose value of the label + with key topologyKey matches that + of any node on which any of the + selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + type: object + type: object + automountServiceAccountToken: + description: AutomountServiceAccountToken indicates + whether a service account token should be automatically + mounted. 
+ type: boolean + containers: + description: List of containers belonging to the + pod. Containers cannot currently be added or + removed. There must be at least one container + in a Pod. Cannot be updated. + items: + description: A single application container + that you want to run within a pod. + properties: + args: + description: 'Arguments to the entrypoint. + The docker image''s CMD is used if this + is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. + If a variable cannot be resolved, the + reference in the input string will be + unchanged. Double $$ are reduced to a + single $, which allows for escaping the + $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, + regardless of whether the variable exists + or not. Cannot be updated. More info: + https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed + within a shell. The docker image''s ENTRYPOINT + is used if this is not provided. Variable + references $(VAR_NAME) are expanded using + the container''s environment. If a variable + cannot be resolved, the reference in the + input string will be unchanged. Double + $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references + will never be expanded, regardless of + whether the variable exists or not. Cannot + be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables + to set in the container. Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment + variable. Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references + $(VAR_NAME) are expanded using the + previously defined environment variables + in the container and any service + environment variables. If a variable + cannot be resolved, the reference + in the input string will be unchanged. + Double $$ are reduced to a single + $, which allows for escaping the + $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal + "$(VAR_NAME)". Escaped references + will never be expanded, regardless + of whether the variable exists or + not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment + variable's value. Cannot be used + if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of + a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' 
+ type: string + optional: + description: Specify whether + the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field + of the pod: supports metadata.name, + metadata.namespace, `metadata.labels['''']`, + `metadata.annotations['''']`, + spec.nodeName, spec.serviceAccountName, + status.hostIP, status.podIP, + status.podIPs.' + properties: + apiVersion: + description: Version of the + schema the FieldPath is + written in terms of, defaults + to "v1". + type: string + fieldPath: + description: Path of the field + to select in the specified + API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource + of the container: only resources + limits and requests (limits.cpu, + limits.memory, limits.ephemeral-storage, + requests.cpu, requests.memory + and requests.ephemeral-storage) + are currently supported.' + properties: + containerName: + description: 'Container name: + required for volumes, optional + for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the + output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of + a secret in the pod's namespace + properties: + key: + description: The key of the + secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether + the Secret or its key must + be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate + environment variables in the container. + The keys defined within a source must + be a C_IDENTIFIER. All invalid keys will + be reported as an event when the container + is starting. When a key exists in multiple + sources, the value associated with the + last source will take precedence. Values + defined by an Env with a duplicate key + will take precedence. Cannot be updated. + items: + description: EnvFromSource represents + the source of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select + from + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the + ConfigMap must be defined + type: boolean + type: object + prefix: + description: An optional identifier + to prepend to each key in the ConfigMap. + Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select + from + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' 
+ type: string + optional: + description: Specify whether the + Secret must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: + https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher + level config management to default or + override container images in workload + controllers like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of + Always, Never, IfNotPresent. Defaults + to Always if :latest tag is specified, + or IfNotPresent otherwise. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management + system should take in response to container + lifecycle events. Cannot be updated. + properties: + postStart: + description: 'PostStart is called immediately + after a container is created. If the + handler fails, the container is terminated + and restarted according to its restart + policy. Other management of the container + blocks until the hook completes. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of + the following should be specified. + Exec specifies the action to take. + properties: + command: + description: Command is the + command line to execute inside + the container, the working + directory for the command is + root ('/') in the container's + filesystem. The command is + simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, + you need to explicitly call + out to that shell. Exit status + of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the + http request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. + You probably want to set "Host" + in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers + to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes + a custom header to be used + in HTTP probes + properties: + name: + description: The header + field name + type: string + value: + description: The header + field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access + on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for + connecting to the host. Defaults + to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies + an action involving a TCP port. + TCP hooks not yet supported TODO: + implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host + name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. 
+ x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately + before a container is terminated due + to an API request or management event + such as liveness/startup probe failure, + preemption, resource contention, etc. + The handler is not called if the container + crashes or exits. The reason for termination + is passed to the handler. The Pod''s + termination grace period countdown + begins before the PreStop hooked is + executed. Regardless of the outcome + of the handler, the container will + eventually terminate within the Pod''s + termination grace period. Other management + of the container blocks until the + hook completes or until the termination + grace period is reached. More info: + https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of + the following should be specified. + Exec specifies the action to take. + properties: + command: + description: Command is the + command line to execute inside + the container, the working + directory for the command is + root ('/') in the container's + filesystem. The command is + simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, + you need to explicitly call + out to that shell. Exit status + of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the + http request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. + You probably want to set "Host" + in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers + to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes + a custom header to be used + in HTTP probes + properties: + name: + description: The header + field name + type: string + value: + description: The header + field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access + on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for + connecting to the host. Defaults + to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies + an action involving a TCP port. + TCP hooks not yet supported TODO: + implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host + name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container + liveness. Container will be restarted + if the probe fails. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the + following should be specified. 
Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. 
Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified + as a DNS_LABEL. Each container in a pod + must have a unique name (DNS_LABEL). Cannot + be updated. + type: string + ports: + description: List of ports to expose from + the container. Exposing a port here gives + the system additional information about + the network connections a container uses, + but is primarily informational. Not specifying + a port here DOES NOT prevent that port + from being exposed. Any port which is + listening on the default "0.0.0.0" address + inside a container will be accessible + from the network. Cannot be updated. + items: + description: ContainerPort represents + a network port in a single container. + properties: + containerPort: + description: Number of port to expose + on the pod's IP address. This must + be a valid port number, 0 < x < + 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind + the external port to. + type: string + hostPort: + description: Number of port to expose + on the host. If specified, this + must be a valid port number, 0 < + x < 65536. If HostNetwork is specified, + this must match ContainerPort. Most + containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must + be an IANA_SVC_NAME and unique within + the pod. Each named port in a pod + must have a unique name. Name for + the port that can be referred to + by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must + be UDP, TCP, or SCTP. Defaults to + "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container + service readiness. Container will be removed + from service endpoints if the probe fails. + Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. 
+ format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. 
More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required + by this container. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum + amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the + minimum amount of compute resources + required. If Requests is omitted for + a container, it defaults to Limits + if that is explicitly specified, otherwise + to an implementation-defined value. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the + security options the container should + be run with. If set, the fields of SecurityContext + override the equivalent fields of PodSecurityContext. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation + controls whether a process can gain + more privileges than its parent process. + This bool directly controls if the + no_new_privs flag will be set on the + container process. AllowPrivilegeEscalation + is true always when the container + is: 1) run as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop + when running containers. Defaults + to the default set of capabilities + granted by the container runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent + POSIX capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent + POSIX capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged + mode. Processes in privileged containers + are essentially equivalent to root + on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type + of proc mount to use for the containers. + The default is DefaultProcMount which + uses the container runtime defaults + for readonly paths and masked paths. + This requires the ProcMountType feature + flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container + has a read-only root filesystem. Default + is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint + of the container process. Uses runtime + default if unset. May also be set + in PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. 
+ format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container + must run as a non-root user. If true, + the Kubelet will validate the image + at runtime to ensure that it does + not run as UID 0 (root) and fail to + start the container if it does. If + unset or false, no such validation + will be performed. May also be set + in PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint + of the container process. Defaults + to user specified in image metadata + if unspecified. May also be set in + PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to + be applied to the container. If unspecified, + the container runtime will allocate + a random SELinux context for each + container. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + properties: + level: + description: Level is SELinux level + label that applies to the container. + type: string + role: + description: Role is a SELinux role + label that applies to the container. + type: string + type: + description: Type is a SELinux type + label that applies to the container. + type: string + user: + description: User is a SELinux user + label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to + use by this container. If seccomp + options are provided at both the pod + & container level, the container options + override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on + the node should be used. The profile + must be preconfigured on the node + to work. Must be a descending + path, relative to the kubelet's + configured seccomp profile location. + Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which + kind of seccomp profile will be + applied. Valid options are: \n + Localhost - a profile defined + in a file on the node should be + used. RuntimeDefault - the container + runtime default profile should + be used. Unconfined - no profile + should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings + applied to all containers. If unspecified, + the options from the PodSecurityContext + will be used. If set in both SecurityContext + and PodSecurityContext, the value + specified in SecurityContext takes + precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec + is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA + credential spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName + is the name of the GMSA credential + spec to use. + type: string + hostProcess: + description: HostProcess determines + if a container should be run as + a 'Host Process' container. This + field is alpha-level and will + only be honored by components + that enable the WindowsHostProcessContainers + feature flag. 
Setting this field + without the feature flag will + result in errors when validating + the Pod. All of a Pod's containers + must have the same effective HostProcess + value (it is not allowed to have + a mix of HostProcess containers + and non-HostProcess containers). In + addition, if HostProcess is true + then HostNetwork must also be + set to true. + type: boolean + runAsUserName: + description: The UserName in Windows + to run the entrypoint of the container + process. Defaults to the user + specified in image metadata if + unspecified. May also be set in + PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that + the Pod has successfully initialized. + If specified, no other probes are executed + until this completes successfully. If + this probe fails, the Pod will be restarted, + just as if the livenessProbe failed. This + can be used to provide different probe + parameters at the beginning of a Pod''s + lifecycle, when it might take a long time + to load data or warm a cache, than during + steady-state operation. This cannot be + updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should + allocate a buffer for stdin in the container + runtime. If this is not set, reads from + stdin in the container will always result + in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime + should close the stdin channel after it + has been opened by a single attach. When + stdin is true the stdin stream will remain + open across multiple attach sessions. + If stdinOnce is set to true, stdin is + opened on container start, is empty until + the first client attaches to stdin, and + then remains open and accepts data until + the client disconnects, at which time + stdin is closed and remains closed until + the container is restarted. If this flag + is false, a container processes that reads + from stdin will never receive an EOF. + Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the + file to which the container''s termination + message will be written is mounted into + the container''s filesystem. Message written + is intended to be brief final status, + such as an assertion failure message. + Will be truncated by the node if greater + than 4096 bytes. The total message length + across all containers will be limited + to 12kb. 
Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination + message should be populated. File will + use the contents of terminationMessagePath + to populate the container status message + on both success and failure. FallbackToLogsOnError + will use the last chunk of container log + output if the termination message file + is empty and the container exited with + an error. The log output is limited to + 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should + allocate a TTY for itself, also requires + 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of + block devices to be used by the container. + items: + description: volumeDevice describes a + mapping of a raw block device within + a container. + properties: + devicePath: + description: devicePath is the path + inside of the container that the + device will be mapped to. + type: string + name: + description: name must match the name + of a persistentVolumeClaim in the + pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the + container's filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: Path within the container + at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines + how mounts are propagated from the + host to container and the other + way around. When not set, MountPropagationNone + is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name + of a Volume. + type: string + readOnly: + description: Mounted read-only if + true, read-write otherwise (false + or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume + from which the container's volume + should be mounted. Defaults to "" + (volume's root). + type: string + subPathExpr: + description: Expanded path within + the volume from which the container's + volume should be mounted. Behaves + similarly to SubPath but environment + variable references $(VAR_NAME) + are expanded using the container's + environment. Defaults to "" (volume's + root). SubPathExpr and SubPath are + mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. + If not specified, the container runtime's + default will be used, which might be configured + in the container image. Cannot be updated. + type: string + required: + - name + type: object + type: array + dnsConfig: + description: Specifies the DNS parameters of a + pod. Parameters specified here will be merged + to the generated DNS configuration based on + DNSPolicy. + properties: + nameservers: + description: A list of DNS name server IP + addresses. This will be appended to the + base nameservers generated from DNSPolicy. + Duplicated nameservers will be removed. + items: + type: string + type: array + options: + description: A list of DNS resolver options. + This will be merged with the base options + generated from DNSPolicy. Duplicated entries + will be removed. 
Resolution options given + in Options will override those that appear + in the base DNSPolicy. + items: + description: PodDNSConfigOption defines + DNS resolver options of a pod. + properties: + name: + description: Required. + type: string + value: + type: string + type: object + type: array + searches: + description: A list of DNS search domains + for host-name lookup. This will be appended + to the base search paths generated from + DNSPolicy. Duplicated search paths will + be removed. + items: + type: string + type: array + type: object + dnsPolicy: + description: Set DNS policy for the pod. Defaults + to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', + 'ClusterFirst', 'Default' or 'None'. DNS parameters + given in DNSConfig will be merged with the policy + selected with DNSPolicy. To have DNS options + set along with hostNetwork, you have to specify + DNS policy explicitly to 'ClusterFirstWithHostNet'. + type: string + enableServiceLinks: + description: 'EnableServiceLinks indicates whether + information about services should be injected + into pod''s environment variables, matching + the syntax of Docker links. Optional: Defaults + to true.' + type: boolean + ephemeralContainers: + description: List of ephemeral containers run + in this pod. Ephemeral containers may be run + in an existing pod to perform user-initiated + actions such as debugging. This list cannot + be specified when creating a pod, and it cannot + be modified by updating the pod spec. In order + to add an ephemeral container to an existing + pod, use the pod's ephemeralcontainers subresource. + This field is alpha-level and is only honored + by servers that enable the EphemeralContainers + feature. + items: + description: An EphemeralContainer is a container + that may be added temporarily to an existing + pod for user-initiated activities such as + debugging. Ephemeral containers have no resource + or scheduling guarantees, and they will not + be restarted when they exit or when a pod + is removed or restarted. If an ephemeral container + causes a pod to exceed its resource allocation, + the pod may be evicted. Ephemeral containers + may not be added by directly updating the + pod spec. They must be added via the pod's + ephemeralcontainers subresource, and they + will appear in the pod spec once added. This + is an alpha feature enabled by the EphemeralContainers + feature flag. + properties: + args: + description: 'Arguments to the entrypoint. + The docker image''s CMD is used if this + is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. + If a variable cannot be resolved, the + reference in the input string will be + unchanged. Double $$ are reduced to a + single $, which allows for escaping the + $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, + regardless of whether the variable exists + or not. Cannot be updated. More info: + https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed + within a shell. The docker image''s ENTRYPOINT + is used if this is not provided. Variable + references $(VAR_NAME) are expanded using + the container''s environment. If a variable + cannot be resolved, the reference in the + input string will be unchanged. 
Double + $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references + will never be expanded, regardless of + whether the variable exists or not. Cannot + be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables + to set in the container. Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment + variable. Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references + $(VAR_NAME) are expanded using the + previously defined environment variables + in the container and any service + environment variables. If a variable + cannot be resolved, the reference + in the input string will be unchanged. + Double $$ are reduced to a single + $, which allows for escaping the + $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal + "$(VAR_NAME)". Escaped references + will never be expanded, regardless + of whether the variable exists or + not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment + variable's value. Cannot be used + if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of + a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether + the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field + of the pod: supports metadata.name, + metadata.namespace, `metadata.labels['''']`, + `metadata.annotations['''']`, + spec.nodeName, spec.serviceAccountName, + status.hostIP, status.podIP, + status.podIPs.' + properties: + apiVersion: + description: Version of the + schema the FieldPath is + written in terms of, defaults + to "v1". + type: string + fieldPath: + description: Path of the field + to select in the specified + API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource + of the container: only resources + limits and requests (limits.cpu, + limits.memory, limits.ephemeral-storage, + requests.cpu, requests.memory + and requests.ephemeral-storage) + are currently supported.' + properties: + containerName: + description: 'Container name: + required for volumes, optional + for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the + output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of + a secret in the pod's namespace + properties: + key: + description: The key of the + secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the + referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether + the Secret or its key must + be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate + environment variables in the container. + The keys defined within a source must + be a C_IDENTIFIER. All invalid keys will + be reported as an event when the container + is starting. When a key exists in multiple + sources, the value associated with the + last source will take precedence. Values + defined by an Env with a duplicate key + will take precedence. Cannot be updated. + items: + description: EnvFromSource represents + the source of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select + from + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the + ConfigMap must be defined + type: boolean + type: object + prefix: + description: An optional identifier + to prepend to each key in the ConfigMap. + Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select + from + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the + Secret must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: + https://kubernetes.io/docs/concepts/containers/images' + type: string + imagePullPolicy: + description: 'Image pull policy. One of + Always, Never, IfNotPresent. Defaults + to Always if :latest tag is specified, + or IfNotPresent otherwise. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle is not allowed for + ephemeral containers. + properties: + postStart: + description: 'PostStart is called immediately + after a container is created. If the + handler fails, the container is terminated + and restarted according to its restart + policy. Other management of the container + blocks until the hook completes. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of + the following should be specified. + Exec specifies the action to take. + properties: + command: + description: Command is the + command line to execute inside + the container, the working + directory for the command is + root ('/') in the container's + filesystem. The command is + simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, + you need to explicitly call + out to that shell. Exit status + of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the + http request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. 
+ You probably want to set "Host" + in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers + to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes + a custom header to be used + in HTTP probes + properties: + name: + description: The header + field name + type: string + value: + description: The header + field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access + on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for + connecting to the host. Defaults + to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies + an action involving a TCP port. + TCP hooks not yet supported TODO: + implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host + name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately + before a container is terminated due + to an API request or management event + such as liveness/startup probe failure, + preemption, resource contention, etc. + The handler is not called if the container + crashes or exits. The reason for termination + is passed to the handler. The Pod''s + termination grace period countdown + begins before the PreStop hooked is + executed. Regardless of the outcome + of the handler, the container will + eventually terminate within the Pod''s + termination grace period. Other management + of the container blocks until the + hook completes or until the termination + grace period is reached. More info: + https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of + the following should be specified. + Exec specifies the action to take. + properties: + command: + description: Command is the + command line to execute inside + the container, the working + directory for the command is + root ('/') in the container's + filesystem. The command is + simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, + you need to explicitly call + out to that shell. Exit status + of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the + http request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. + You probably want to set "Host" + in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers + to set in the request. HTTP + allows repeated headers. 
+ items: + description: HTTPHeader describes + a custom header to be used + in HTTP probes + properties: + name: + description: The header + field name + type: string + value: + description: The header + field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access + on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for + connecting to the host. Defaults + to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies + an action involving a TCP port. + TCP hooks not yet supported TODO: + implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host + name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: Probes are not allowed for + ephemeral containers. + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the ephemeral container + specified as a DNS_LABEL. This name must + be unique among all containers, init containers + and ephemeral containers. + type: string + ports: + description: Ports are not allowed for ephemeral + containers. + items: + description: ContainerPort represents + a network port in a single container. + properties: + containerPort: + description: Number of port to expose + on the pod's IP address. This must + be a valid port number, 0 < x < + 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind + the external port to. + type: string + hostPort: + description: Number of port to expose + on the host. If specified, this + must be a valid port number, 0 < + x < 65536. If HostNetwork is specified, + this must match ContainerPort. Most + containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must + be an IANA_SVC_NAME and unique within + the pod. Each named port in a pod + must have a unique name. Name for + the port that can be referred to + by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must + be UDP, TCP, or SCTP. Defaults to + "TCP". 
+ type: string + required: + - containerPort + type: object + type: array + readinessProbe: + description: Probes are not allowed for + ephemeral containers. + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. 
The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: Resources are not allowed for + ephemeral containers. Ephemeral containers + use spare resources already allocated + to the pod. + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum + amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the + minimum amount of compute resources + required. If Requests is omitted for + a container, it defaults to Limits + if that is explicitly specified, otherwise + to an implementation-defined value. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'Optional: SecurityContext + defines the security options the ephemeral + container should be run with. If set, + the fields of SecurityContext override + the equivalent fields of PodSecurityContext.' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation + controls whether a process can gain + more privileges than its parent process. + This bool directly controls if the + no_new_privs flag will be set on the + container process. AllowPrivilegeEscalation + is true always when the container + is: 1) run as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop + when running containers. Defaults + to the default set of capabilities + granted by the container runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent + POSIX capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent + POSIX capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged + mode. Processes in privileged containers + are essentially equivalent to root + on the host. Defaults to false. 
+ type: boolean + procMount: + description: procMount denotes the type + of proc mount to use for the containers. + The default is DefaultProcMount which + uses the container runtime defaults + for readonly paths and masked paths. + This requires the ProcMountType feature + flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container + has a read-only root filesystem. Default + is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint + of the container process. Uses runtime + default if unset. May also be set + in PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container + must run as a non-root user. If true, + the Kubelet will validate the image + at runtime to ensure that it does + not run as UID 0 (root) and fail to + start the container if it does. If + unset or false, no such validation + will be performed. May also be set + in PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint + of the container process. Defaults + to user specified in image metadata + if unspecified. May also be set in + PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to + be applied to the container. If unspecified, + the container runtime will allocate + a random SELinux context for each + container. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + properties: + level: + description: Level is SELinux level + label that applies to the container. + type: string + role: + description: Role is a SELinux role + label that applies to the container. + type: string + type: + description: Type is a SELinux type + label that applies to the container. + type: string + user: + description: User is a SELinux user + label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to + use by this container. If seccomp + options are provided at both the pod + & container level, the container options + override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on + the node should be used. The profile + must be preconfigured on the node + to work. Must be a descending + path, relative to the kubelet's + configured seccomp profile location. + Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which + kind of seccomp profile will be + applied. Valid options are: \n + Localhost - a profile defined + in a file on the node should be + used. RuntimeDefault - the container + runtime default profile should + be used. Unconfined - no profile + should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings + applied to all containers. If unspecified, + the options from the PodSecurityContext + will be used. If set in both SecurityContext + and PodSecurityContext, the value + specified in SecurityContext takes + precedence. 
+ properties: + gmsaCredentialSpec: + description: GMSACredentialSpec + is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA + credential spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName + is the name of the GMSA credential + spec to use. + type: string + hostProcess: + description: HostProcess determines + if a container should be run as + a 'Host Process' container. This + field is alpha-level and will + only be honored by components + that enable the WindowsHostProcessContainers + feature flag. Setting this field + without the feature flag will + result in errors when validating + the Pod. All of a Pod's containers + must have the same effective HostProcess + value (it is not allowed to have + a mix of HostProcess containers + and non-HostProcess containers). In + addition, if HostProcess is true + then HostNetwork must also be + set to true. + type: boolean + runAsUserName: + description: The UserName in Windows + to run the entrypoint of the container + process. Defaults to the user + specified in image metadata if + unspecified. May also be set in + PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: Probes are not allowed for + ephemeral containers. + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should + allocate a buffer for stdin in the container + runtime. If this is not set, reads from + stdin in the container will always result + in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime + should close the stdin channel after it + has been opened by a single attach. When + stdin is true the stdin stream will remain + open across multiple attach sessions. + If stdinOnce is set to true, stdin is + opened on container start, is empty until + the first client attaches to stdin, and + then remains open and accepts data until + the client disconnects, at which time + stdin is closed and remains closed until + the container is restarted. If this flag + is false, a container processes that reads + from stdin will never receive an EOF. + Default is false + type: boolean + targetContainerName: + description: If set, the name of the container + from PodSpec that this ephemeral container + targets. The ephemeral container will + be run in the namespaces (IPC, PID, etc) + of this container. If not set then the + ephemeral container is run in whatever + namespaces are shared for the pod. Note + that the container runtime must support + this feature. 
+ type: string + terminationMessagePath: + description: 'Optional: Path at which the + file to which the container''s termination + message will be written is mounted into + the container''s filesystem. Message written + is intended to be brief final status, + such as an assertion failure message. + Will be truncated by the node if greater + than 4096 bytes. The total message length + across all containers will be limited + to 12kb. Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination + message should be populated. File will + use the contents of terminationMessagePath + to populate the container status message + on both success and failure. FallbackToLogsOnError + will use the last chunk of container log + output if the termination message file + is empty and the container exited with + an error. The log output is limited to + 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should + allocate a TTY for itself, also requires + 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of + block devices to be used by the container. + items: + description: volumeDevice describes a + mapping of a raw block device within + a container. + properties: + devicePath: + description: devicePath is the path + inside of the container that the + device will be mapped to. + type: string + name: + description: name must match the name + of a persistentVolumeClaim in the + pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the + container's filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: Path within the container + at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines + how mounts are propagated from the + host to container and the other + way around. When not set, MountPropagationNone + is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name + of a Volume. + type: string + readOnly: + description: Mounted read-only if + true, read-write otherwise (false + or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume + from which the container's volume + should be mounted. Defaults to "" + (volume's root). + type: string + subPathExpr: + description: Expanded path within + the volume from which the container's + volume should be mounted. Behaves + similarly to SubPath but environment + variable references $(VAR_NAME) + are expanded using the container's + environment. Defaults to "" (volume's + root). SubPathExpr and SubPath are + mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. + If not specified, the container runtime's + default will be used, which might be configured + in the container image. Cannot be updated. + type: string + required: + - name + type: object + type: array + hostAliases: + description: HostAliases is an optional list of + hosts and IPs that will be injected into the + pod's hosts file if specified. This is only + valid for non-hostNetwork pods. 
+ items: + description: HostAlias holds the mapping between + IP and hostnames that will be injected as + an entry in the pod's hosts file. + properties: + hostnames: + description: Hostnames for the above IP + address. + items: + type: string + type: array + ip: + description: IP address of the host file + entry. + type: string + type: object + type: array + hostIPC: + description: 'Use the host''s ipc namespace. Optional: + Default to false.' + type: boolean + hostNetwork: + description: Host networking requested for this + pod. Use the host's network namespace. If this + option is set, the ports that will be used must + be specified. Default to false. + type: boolean + hostPID: + description: 'Use the host''s pid namespace. Optional: + Default to false.' + type: boolean + hostname: + description: Specifies the hostname of the Pod + If not specified, the pod's hostname will be + set to a system-defined value. + type: string + imagePullSecrets: + description: 'ImagePullSecrets is an optional + list of references to secrets in the same namespace + to use for pulling any of the images used by + this PodSpec. If specified, these secrets will + be passed to individual puller implementations + for them to use. For example, in the case of + docker, only DockerConfig type secrets are honored. + More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod' + items: + description: LocalObjectReference contains enough + information to let you locate the referenced + object inside the same namespace. + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + type: array + initContainers: + description: 'List of initialization containers + belonging to the pod. Init containers are executed + in order prior to containers being started. + If any init container fails, the pod is considered + to have failed and is handled according to its + restartPolicy. The name for an init container + or normal container must be unique among all + containers. Init containers may not have Lifecycle + actions, Readiness probes, Liveness probes, + or Startup probes. The resourceRequirements + of an init container are taken into account + during scheduling by finding the highest request/limit + for each resource type, and then using the max + of of that value or the sum of the normal containers. + Limits are applied to init containers in a similar + fashion. Init containers cannot currently be + added or removed. Cannot be updated. More info: + https://kubernetes.io/docs/concepts/workloads/pods/init-containers/' + items: + description: A single application container + that you want to run within a pod. + properties: + args: + description: 'Arguments to the entrypoint. + The docker image''s CMD is used if this + is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. + If a variable cannot be resolved, the + reference in the input string will be + unchanged. Double $$ are reduced to a + single $, which allows for escaping the + $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, + regardless of whether the variable exists + or not. Cannot be updated. 
More info: + https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed + within a shell. The docker image''s ENTRYPOINT + is used if this is not provided. Variable + references $(VAR_NAME) are expanded using + the container''s environment. If a variable + cannot be resolved, the reference in the + input string will be unchanged. Double + $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references + will never be expanded, regardless of + whether the variable exists or not. Cannot + be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables + to set in the container. Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment + variable. Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references + $(VAR_NAME) are expanded using the + previously defined environment variables + in the container and any service + environment variables. If a variable + cannot be resolved, the reference + in the input string will be unchanged. + Double $$ are reduced to a single + $, which allows for escaping the + $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal + "$(VAR_NAME)". Escaped references + will never be expanded, regardless + of whether the variable exists or + not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment + variable's value. Cannot be used + if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of + a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether + the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field + of the pod: supports metadata.name, + metadata.namespace, `metadata.labels['''']`, + `metadata.annotations['''']`, + spec.nodeName, spec.serviceAccountName, + status.hostIP, status.podIP, + status.podIPs.' + properties: + apiVersion: + description: Version of the + schema the FieldPath is + written in terms of, defaults + to "v1". + type: string + fieldPath: + description: Path of the field + to select in the specified + API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource + of the container: only resources + limits and requests (limits.cpu, + limits.memory, limits.ephemeral-storage, + requests.cpu, requests.memory + and requests.ephemeral-storage) + are currently supported.' 
+ properties: + containerName: + description: 'Container name: + required for volumes, optional + for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the + output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of + a secret in the pod's namespace + properties: + key: + description: The key of the + secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether + the Secret or its key must + be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate + environment variables in the container. + The keys defined within a source must + be a C_IDENTIFIER. All invalid keys will + be reported as an event when the container + is starting. When a key exists in multiple + sources, the value associated with the + last source will take precedence. Values + defined by an Env with a duplicate key + will take precedence. Cannot be updated. + items: + description: EnvFromSource represents + the source of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select + from + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the + ConfigMap must be defined + type: boolean + type: object + prefix: + description: An optional identifier + to prepend to each key in the ConfigMap. + Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select + from + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the + Secret must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: + https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher + level config management to default or + override container images in workload + controllers like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of + Always, Never, IfNotPresent. Defaults + to Always if :latest tag is specified, + or IfNotPresent otherwise. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management + system should take in response to container + lifecycle events. Cannot be updated. + properties: + postStart: + description: 'PostStart is called immediately + after a container is created. 
If the + handler fails, the container is terminated + and restarted according to its restart + policy. Other management of the container + blocks until the hook completes. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of + the following should be specified. + Exec specifies the action to take. + properties: + command: + description: Command is the + command line to execute inside + the container, the working + directory for the command is + root ('/') in the container's + filesystem. The command is + simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, + you need to explicitly call + out to that shell. Exit status + of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the + http request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. + You probably want to set "Host" + in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers + to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes + a custom header to be used + in HTTP probes + properties: + name: + description: The header + field name + type: string + value: + description: The header + field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access + on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for + connecting to the host. Defaults + to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies + an action involving a TCP port. + TCP hooks not yet supported TODO: + implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host + name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately + before a container is terminated due + to an API request or management event + such as liveness/startup probe failure, + preemption, resource contention, etc. + The handler is not called if the container + crashes or exits. The reason for termination + is passed to the handler. The Pod''s + termination grace period countdown + begins before the PreStop hooked is + executed. Regardless of the outcome + of the handler, the container will + eventually terminate within the Pod''s + termination grace period. Other management + of the container blocks until the + hook completes or until the termination + grace period is reached. More info: + https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of + the following should be specified. + Exec specifies the action to take. 
+ properties: + command: + description: Command is the + command line to execute inside + the container, the working + directory for the command is + root ('/') in the container's + filesystem. The command is + simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, + you need to explicitly call + out to that shell. Exit status + of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the + http request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. + You probably want to set "Host" + in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers + to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes + a custom header to be used + in HTTP probes + properties: + name: + description: The header + field name + type: string + value: + description: The header + field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access + on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for + connecting to the host. Defaults + to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies + an action involving a TCP port. + TCP hooks not yet supported TODO: + implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host + name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name + of the port to access on the + container. Number must be + in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container + liveness. Container will be restarted + if the probe fails. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. 
+ type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified + as a DNS_LABEL. Each container in a pod + must have a unique name (DNS_LABEL). Cannot + be updated. + type: string + ports: + description: List of ports to expose from + the container. 
Exposing a port here gives + the system additional information about + the network connections a container uses, + but is primarily informational. Not specifying + a port here DOES NOT prevent that port + from being exposed. Any port which is + listening on the default "0.0.0.0" address + inside a container will be accessible + from the network. Cannot be updated. + items: + description: ContainerPort represents + a network port in a single container. + properties: + containerPort: + description: Number of port to expose + on the pod's IP address. This must + be a valid port number, 0 < x < + 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind + the external port to. + type: string + hostPort: + description: Number of port to expose + on the host. If specified, this + must be a valid port number, 0 < + x < 65536. If HostNetwork is specified, + this must match ContainerPort. Most + containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must + be an IANA_SVC_NAME and unique within + the pod. Each named port in a pod + must have a unique name. Name for + the port that can be referred to + by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must + be UDP, TCP, or SCTP. Defaults to + "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container + service readiness. Container will be removed + from service endpoints if the probe fails. + Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. 
+ x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required + by this container. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum + amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the + minimum amount of compute resources + required. 
If Requests is omitted for + a container, it defaults to Limits + if that is explicitly specified, otherwise + to an implementation-defined value. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the + security options the container should + be run with. If set, the fields of SecurityContext + override the equivalent fields of PodSecurityContext. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation + controls whether a process can gain + more privileges than its parent process. + This bool directly controls if the + no_new_privs flag will be set on the + container process. AllowPrivilegeEscalation + is true always when the container + is: 1) run as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop + when running containers. Defaults + to the default set of capabilities + granted by the container runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent + POSIX capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent + POSIX capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged + mode. Processes in privileged containers + are essentially equivalent to root + on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type + of proc mount to use for the containers. + The default is DefaultProcMount which + uses the container runtime defaults + for readonly paths and masked paths. + This requires the ProcMountType feature + flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container + has a read-only root filesystem. Default + is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint + of the container process. Uses runtime + default if unset. May also be set + in PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container + must run as a non-root user. If true, + the Kubelet will validate the image + at runtime to ensure that it does + not run as UID 0 (root) and fail to + start the container if it does. If + unset or false, no such validation + will be performed. May also be set + in PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint + of the container process. Defaults + to user specified in image metadata + if unspecified. May also be set in + PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to + be applied to the container. If unspecified, + the container runtime will allocate + a random SELinux context for each + container. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. 
+ properties: + level: + description: Level is SELinux level + label that applies to the container. + type: string + role: + description: Role is a SELinux role + label that applies to the container. + type: string + type: + description: Type is a SELinux type + label that applies to the container. + type: string + user: + description: User is a SELinux user + label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to + use by this container. If seccomp + options are provided at both the pod + & container level, the container options + override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on + the node should be used. The profile + must be preconfigured on the node + to work. Must be a descending + path, relative to the kubelet's + configured seccomp profile location. + Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which + kind of seccomp profile will be + applied. Valid options are: \n + Localhost - a profile defined + in a file on the node should be + used. RuntimeDefault - the container + runtime default profile should + be used. Unconfined - no profile + should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings + applied to all containers. If unspecified, + the options from the PodSecurityContext + will be used. If set in both SecurityContext + and PodSecurityContext, the value + specified in SecurityContext takes + precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec + is where the GMSA admission webhook + (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA + credential spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName + is the name of the GMSA credential + spec to use. + type: string + hostProcess: + description: HostProcess determines + if a container should be run as + a 'Host Process' container. This + field is alpha-level and will + only be honored by components + that enable the WindowsHostProcessContainers + feature flag. Setting this field + without the feature flag will + result in errors when validating + the Pod. All of a Pod's containers + must have the same effective HostProcess + value (it is not allowed to have + a mix of HostProcess containers + and non-HostProcess containers). In + addition, if HostProcess is true + then HostNetwork must also be + set to true. + type: boolean + runAsUserName: + description: The UserName in Windows + to run the entrypoint of the container + process. Defaults to the user + specified in image metadata if + unspecified. May also be set in + PodSecurityContext. If set in + both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that + the Pod has successfully initialized. + If specified, no other probes are executed + until this completes successfully. If + this probe fails, the Pod will be restarted, + just as if the livenessProbe failed. This + can be used to provide different probe + parameters at the beginning of a Pod''s + lifecycle, when it might take a long time + to load data or warm a cache, than during + steady-state operation. This cannot be + updated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the + following should be specified. Exec + specifies the action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the + command is root ('/') in the + container's filesystem. The command + is simply exec'd, it is not run + inside a shell, so traditional + shell instructions ('|', etc) + won't work. To use a shell, you + need to explicitly call out to + that shell. Exit status of 0 is + treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures + for the probe to be considered failed + after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect + to, defaults to the pod IP. You + probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set + in the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in + HTTP probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the + HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after + the container has started before liveness + probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) + to perform the probe. Default to 10 + seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes + for the probe to be considered successful + after having failed. Defaults to 1. + Must be 1 for liveness and startup. + Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an + action involving a TCP port. TCP hooks + not yet supported TODO: implement + a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name + to connect to, defaults to the + pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the + port to access on the container. + Number must be in the range 1 + to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully + upon probe failure. The grace period + is the duration in seconds after the + processes running in the pod are sent + a termination signal and the time + when the processes are forcibly halted + with a kill signal. 
Set this value + longer than the expected cleanup time + for your process. If this value is + nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value + overrides the value provided by the + pod spec. Value must be non-negative + integer. The value zero indicates + stop immediately via the kill signal + (no opportunity to shut down). This + is a beta field and requires enabling + ProbeTerminationGracePeriod feature + gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after + which the probe times out. Defaults + to 1 second. Minimum value is 1. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should + allocate a buffer for stdin in the container + runtime. If this is not set, reads from + stdin in the container will always result + in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime + should close the stdin channel after it + has been opened by a single attach. When + stdin is true the stdin stream will remain + open across multiple attach sessions. + If stdinOnce is set to true, stdin is + opened on container start, is empty until + the first client attaches to stdin, and + then remains open and accepts data until + the client disconnects, at which time + stdin is closed and remains closed until + the container is restarted. If this flag + is false, a container processes that reads + from stdin will never receive an EOF. + Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the + file to which the container''s termination + message will be written is mounted into + the container''s filesystem. Message written + is intended to be brief final status, + such as an assertion failure message. + Will be truncated by the node if greater + than 4096 bytes. The total message length + across all containers will be limited + to 12kb. Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination + message should be populated. File will + use the contents of terminationMessagePath + to populate the container status message + on both success and failure. FallbackToLogsOnError + will use the last chunk of container log + output if the termination message file + is empty and the container exited with + an error. The log output is limited to + 2048 bytes or 80 lines, whichever is smaller. + Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should + allocate a TTY for itself, also requires + 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of + block devices to be used by the container. + items: + description: volumeDevice describes a + mapping of a raw block device within + a container. + properties: + devicePath: + description: devicePath is the path + inside of the container that the + device will be mapped to. + type: string + name: + description: name must match the name + of a persistentVolumeClaim in the + pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the + container's filesystem. Cannot be updated. 
+ items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: Path within the container + at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines + how mounts are propagated from the + host to container and the other + way around. When not set, MountPropagationNone + is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name + of a Volume. + type: string + readOnly: + description: Mounted read-only if + true, read-write otherwise (false + or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume + from which the container's volume + should be mounted. Defaults to "" + (volume's root). + type: string + subPathExpr: + description: Expanded path within + the volume from which the container's + volume should be mounted. Behaves + similarly to SubPath but environment + variable references $(VAR_NAME) + are expanded using the container's + environment. Defaults to "" (volume's + root). SubPathExpr and SubPath are + mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. + If not specified, the container runtime's + default will be used, which might be configured + in the container image. Cannot be updated. + type: string + required: + - name + type: object + type: array + nodeName: + description: NodeName is a request to schedule + this pod onto a specific node. If it is non-empty, + the scheduler simply schedules this pod onto + that node, assuming that it fits resource requirements. + type: string + nodeSelector: + additionalProperties: + type: string + description: 'NodeSelector is a selector which + must be true for the pod to fit on a node. Selector + which must match a node''s labels for the pod + to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/' + type: object + x-kubernetes-map-type: atomic + overhead: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Overhead represents the resource + overhead associated with running a pod for a + given RuntimeClass. This field will be autopopulated + at admission time by the RuntimeClass admission + controller. If the RuntimeClass admission controller + is enabled, overhead must not be set in Pod + create requests. The RuntimeClass admission + controller will reject Pod create requests which + have the overhead already set. If RuntimeClass + is configured and selected in the PodSpec, Overhead + will be set to the value defined in the corresponding + RuntimeClass, otherwise it will remain unset + and treated as zero. More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md + This field is beta-level as of Kubernetes v1.18, + and is only honored by servers that enable the + PodOverhead feature.' + type: object + preemptionPolicy: + description: PreemptionPolicy is the Policy for + preempting pods with lower priority. One of + Never, PreemptLowerPriority. Defaults to PreemptLowerPriority + if unset. This field is beta-level, gated by + the NonPreemptingPriority feature-gate. + type: string + priority: + description: The priority value. 
Various system + components use this field to find the priority + of the pod. When Priority Admission Controller + is enabled, it prevents users from setting this + field. The admission controller populates this + field from PriorityClassName. The higher the + value, the higher the priority. + format: int32 + type: integer + priorityClassName: + description: If specified, indicates the pod's + priority. "system-node-critical" and "system-cluster-critical" + are two special keywords which indicate the + highest priorities with the former being the + highest priority. Any other name must be defined + by creating a PriorityClass object with that + name. If not specified, the pod priority will + be default or zero if there is no default. + type: string + readinessGates: + description: 'If specified, all readiness gates + will be evaluated for pod readiness. A pod is + ready when all its containers are ready AND + all conditions specified in the readiness gates + have status equal to "True" More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates' + items: + description: PodReadinessGate contains the reference + to a pod condition + properties: + conditionType: + description: ConditionType refers to a condition + in the pod's condition list with matching + type. + type: string + required: + - conditionType + type: object + type: array + restartPolicy: + description: 'Restart policy for all containers + within the pod. One of Always, OnFailure, Never. + Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy' + type: string + runtimeClassName: + description: 'RuntimeClassName refers to a RuntimeClass + object in the node.k8s.io group, which should + be used to run this pod. If no RuntimeClass + resource matches the named class, the pod will + not be run. If unset or empty, the "legacy" + RuntimeClass will be used, which is an implicit + class with an empty definition that uses the + default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class + This is a beta feature as of Kubernetes v1.14.' + type: string + schedulerName: + description: If specified, the pod will be dispatched + by specified scheduler. If not specified, the + pod will be dispatched by default scheduler. + type: string + securityContext: + description: 'SecurityContext holds pod-level + security attributes and common container settings. + Optional: Defaults to empty. See type description + for default values of each field.' + properties: + fsGroup: + description: "A special supplemental group + that applies to all containers in a pod. + Some volume types allow the Kubelet to change + the ownership of that volume to be owned + by the pod: \n 1. The owning GID will be + the FSGroup 2. The setgid bit is set (new + files created in the volume will be owned + by FSGroup) 3. The permission bits are OR'd + with rw-rw---- \n If unset, the Kubelet + will not modify the ownership and permissions + of any volume." + format: int64 + type: integer + fsGroupChangePolicy: + description: 'fsGroupChangePolicy defines + behavior of changing ownership and permission + of the volume before being exposed inside + Pod. This field will only apply to volume + types which support fsGroup based ownership(and + permissions). It will have no effect on + ephemeral volume types such as: secret, + configmaps and emptydir. Valid values are + "OnRootMismatch" and "Always". If not specified, + "Always" is used.' 
+ type: string + runAsGroup: + description: The GID to run the entrypoint + of the container process. Uses runtime default + if unset. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence for that container. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container + must run as a non-root user. If true, the + Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 + (root) and fail to start the container if + it does. If unset or false, no such validation + will be performed. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint + of the container process. Defaults to user + specified in image metadata if unspecified. + May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence for that container. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied + to all containers. If unspecified, the container + runtime will allocate a random SELinux context + for each container. May also be set in + SecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified + in SecurityContext takes precedence for + that container. + properties: + level: + description: Level is SELinux level label + that applies to the container. + type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by + the containers in this pod. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on the node + should be used. The profile must be + preconfigured on the node to work. Must + be a descending path, relative to the + kubelet's configured seccomp profile + location. Must only be set if type is + "Localhost". + type: string + type: + description: "type indicates which kind + of seccomp profile will be applied. + Valid options are: \n Localhost - a + profile defined in a file on the node + should be used. RuntimeDefault - the + container runtime default profile should + be used. Unconfined - no profile should + be applied." + type: string + required: + - type + type: object + supplementalGroups: + description: A list of groups applied to the + first process run in each container, in + addition to the container's primary GID. If + unspecified, no groups will be added to + any container. + items: + format: int64 + type: integer + type: array + sysctls: + description: Sysctls hold a list of namespaced + sysctls used for the pod. Pods with unsupported + sysctls (by the container runtime) might + fail to launch. 
+ items: + description: Sysctl defines a kernel parameter + to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to + set + type: string + required: + - name + - value + type: object + type: array + windowsOptions: + description: The Windows specific settings + applied to all containers. If unspecified, + the options within a container's SecurityContext + will be used. If set in both SecurityContext + and PodSecurityContext, the value specified + in SecurityContext takes precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where + the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA credential + spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is + the name of the GMSA credential spec + to use. + type: string + hostProcess: + description: HostProcess determines if + a container should be run as a 'Host + Process' container. This field is alpha-level + and will only be honored by components + that enable the WindowsHostProcessContainers + feature flag. Setting this field without + the feature flag will result in errors + when validating the Pod. All of a Pod's + containers must have the same effective + HostProcess value (it is not allowed + to have a mix of HostProcess containers + and non-HostProcess containers). In + addition, if HostProcess is true then + HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to + run the entrypoint of the container + process. Defaults to the user specified + in image metadata if unspecified. May + also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + serviceAccount: + description: 'DeprecatedServiceAccount is a depreciated + alias for ServiceAccountName. Deprecated: Use + serviceAccountName instead.' + type: string + serviceAccountName: + description: 'ServiceAccountName is the name of + the ServiceAccount to use to run this pod. More + info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/' + type: string + setHostnameAsFQDN: + description: If true the pod's hostname will be + configured as the pod's FQDN, rather than the + leaf name (the default). In Linux containers, + this means setting the FQDN in the hostname + field of the kernel (the nodename field of struct + utsname). In Windows containers, this means + setting the registry value of hostname for the + registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters + to FQDN. If a pod does not have FQDN, this has + no effect. Default to false. + type: boolean + shareProcessNamespace: + description: 'Share a single process namespace + between all of the containers in a pod. When + this is set containers will be able to view + and signal processes from other containers in + the same pod, and the first process in each + container will not be assigned PID 1. HostPID + and ShareProcessNamespace cannot both be set. + Optional: Default to false.' + type: boolean + subdomain: + description: If specified, the fully qualified + Pod hostname will be "...svc.". If not specified, + the pod will not have a domainname at all. 
+ type: string + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully. May be decreased + in delete request. Value must be non-negative + integer. The value zero indicates stop immediately + via the kill signal (no opportunity to shut + down). If this value is nil, the default grace + period will be used instead. The grace period + is the duration in seconds after the processes + running in the pod are sent a termination signal + and the time when the processes are forcibly + halted with a kill signal. Set this value longer + than the expected cleanup time for your process. + Defaults to 30 seconds. + format: int64 + type: integer + tolerations: + description: If specified, the pod's tolerations. + items: + description: The pod this Toleration is attached + to tolerates any taint that matches the triple + using the matching operator + . + properties: + effect: + description: Effect indicates the taint + effect to match. Empty means match all + taint effects. When specified, allowed + values are NoSchedule, PreferNoSchedule + and NoExecute. + type: string + key: + description: Key is the taint key that the + toleration applies to. Empty means match + all taint keys. If the key is empty, operator + must be Exists; this combination means + to match all values and all keys. + type: string + operator: + description: Operator represents a key's + relationship to the value. Valid operators + are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, + so that a pod can tolerate all taints + of a particular category. + type: string + tolerationSeconds: + description: TolerationSeconds represents + the period of time the toleration (which + must be of effect NoExecute, otherwise + this field is ignored) tolerates the taint. + By default, it is not set, which means + tolerate the taint forever (do not evict). + Zero and negative values will be treated + as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: Value is the taint value the + toleration matches to. If the operator + is Exists, the value should be empty, + otherwise just a regular string. + type: string + type: object + type: array + topologySpreadConstraints: + description: TopologySpreadConstraints describes + how a group of pods ought to spread across topology + domains. Scheduler will schedule pods in a way + which abides by the constraints. All topologySpreadConstraints + are ANDed. + items: + description: TopologySpreadConstraint specifies + how to spread matching pods among the given + topology. + properties: + labelSelector: + description: LabelSelector is used to find + matching pods. Pods that match this label + selector are counted to determine the + number of pods in their corresponding + topology domain. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. Valid operators are + In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array + of string values. If the operator + is In or NotIn, the values array + must be non-empty. 
If the operator + is Exists or DoesNotExist, the + values array must be empty. + This array is replaced during + a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. A single {key,value} + in the matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", the operator + is "In", and the values array contains + only "value". The requirements are + ANDed. + type: object + type: object + maxSkew: + description: 'MaxSkew describes the degree + to which pods may be unevenly distributed. + When `whenUnsatisfiable=DoNotSchedule`, + it is the maximum permitted difference + between the number of matching pods in + the target topology and the global minimum. + For example, in a 3-zone cluster, MaxSkew + is set to 1, and pods with the same labelSelector + spread as 1/1/0: | zone1 | zone2 | zone3 + | | P | P | | - if MaxSkew + is 1, incoming pod can only be scheduled + to zone3 to become 1/1/1; scheduling it + onto zone1(zone2) would make the ActualSkew(2-0) + on zone1(zone2) violate MaxSkew(1). - + if MaxSkew is 2, incoming pod can be scheduled + onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, + it is used to give higher precedence to + topologies that satisfy it. It''s a required + field. Default value is 1 and 0 is not + allowed.' + format: int32 + type: integer + topologyKey: + description: TopologyKey is the key of node + labels. Nodes that have a label with this + key and identical values are considered + to be in the same topology. We consider + each as a "bucket", and try + to put balanced number of pods into each + bucket. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates + how to deal with a pod if it doesn''t + satisfy the spread constraint. - DoNotSchedule + (default) tells the scheduler not to schedule + it. - ScheduleAnyway tells the scheduler + to schedule the pod in any location, but + giving higher precedence to topologies + that would help reduce the skew. A constraint + is considered "Unsatisfiable" for an incoming + pod if and only if every possible node + assigment for that pod would violate "MaxSkew" + on some topology. For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods + with the same labelSelector spread as + 3/1/1: | zone1 | zone2 | zone3 | | P P + P | P | P | If WhenUnsatisfiable + is set to DoNotSchedule, incoming pod + can only be scheduled to zone2(zone3) + to become 3/2/1(3/1/2) as ActualSkew(2-1) + on zone2(zone3) satisfies MaxSkew(1). + In other words, the cluster can still + be imbalanced, but scheduler won''t make + it *more* imbalanced. It''s a required + field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array + x-kubernetes-list-map-keys: + - topologyKey + - whenUnsatisfiable + x-kubernetes-list-type: map + volumes: + description: 'List of volumes that can be mounted + by containers belonging to the pod. More info: + https://kubernetes.io/docs/concepts/storage/volumes' + items: + description: Volume represents a named volume + in a pod that may be accessed by any container + in the pod. + properties: + awsElasticBlockStore: + description: 'AWSElasticBlockStore represents + an AWS Disk resource that is attached + to a kubelet''s host machine and then + exposed to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + properties: + fsType: + description: 'Filesystem type of the + volume that you want to mount. Tip: + Ensure that the filesystem type is + supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + TODO: how do we prevent errors in + the filesystem from compromising the + machine' + type: string + partition: + description: 'The partition in the volume + that you want to mount. If omitted, + the default is to mount by volume + name. Examples: For volume /dev/sda1, + you specify the partition as "1". + Similarly, the volume partition for + /dev/sda is "0" (or you can leave + the property empty).' + format: int32 + type: integer + readOnly: + description: 'Specify "true" to force + and set the ReadOnly property in VolumeMounts + to "true". If omitted, the default + is "false". More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + type: boolean + volumeID: + description: 'Unique ID of the persistent + disk resource in AWS (Amazon EBS volume). + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + type: string + required: + - volumeID + type: object + azureDisk: + description: AzureDisk represents an Azure + Data Disk mount on the host and bind mount + to the pod. + properties: + cachingMode: + description: 'Host Caching mode: None, + Read Only, Read Write.' + type: string + diskName: + description: The Name of the data disk + in the blob storage + type: string + diskURI: + description: The URI the data disk in + the blob storage + type: string + fsType: + description: Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + type: string + kind: + description: 'Expected values Shared: + multiple blob disks per storage account Dedicated: + single blob disk per storage account Managed: + azure managed data disk (only in managed + availability set). defaults to shared' + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly + setting in VolumeMounts. + type: boolean + required: + - diskName + - diskURI + type: object + azureFile: + description: AzureFile represents an Azure + File Service mount on the host and bind + mount to the pod. + properties: + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly + setting in VolumeMounts. + type: boolean + secretName: + description: the name of secret that + contains Azure Storage Account Name + and Key + type: string + shareName: + description: Share Name + type: string + required: + - secretName + - shareName + type: object + cephfs: + description: CephFS represents a Ceph FS + mount on the host that shares a pod's + lifetime + properties: + monitors: + description: 'Required: Monitors is + a collection of Ceph monitors More + info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + items: + type: string + type: array + path: + description: 'Optional: Used as the + mounted root, rather than the full + Ceph tree, default is /' + type: string + readOnly: + description: 'Optional: Defaults to + false (read/write). ReadOnly here + will force the ReadOnly setting in + VolumeMounts. 
More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: boolean + secretFile: + description: 'Optional: SecretFile is + the path to key ring for User, default + is /etc/ceph/user.secret More info: + https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: string + secretRef: + description: 'Optional: SecretRef is + reference to the authentication secret + for User, default is empty. More info: + https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + user: + description: 'Optional: User is the + rados user name, default is admin + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: string + required: + - monitors + type: object + cinder: + description: 'Cinder represents a cinder + volume attached and mounted on kubelets + host machine. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + properties: + fsType: + description: 'Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Examples: + "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: string + readOnly: + description: 'Optional: Defaults to + false (read/write). ReadOnly here + will force the ReadOnly setting in + VolumeMounts. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: boolean + secretRef: + description: 'Optional: points to a + secret object containing parameters + used to connect to OpenStack.' + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + volumeID: + description: 'volume id used to identify + the volume in cinder. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: string + required: + - volumeID + type: object + configMap: + description: ConfigMap represents a configMap + that should populate this volume + properties: + defaultMode: + description: 'Optional: mode bits used + to set permissions on created files + by default. Must be an octal value + between 0000 and 0777 or a decimal + value between 0 and 511. YAML accepts + both octal and decimal values, JSON + requires decimal values for mode bits. + Defaults to 0644. Directories within + the path are not affected by this + setting. This might be in conflict + with other options that affect the + file mode, like fsGroup, and the result + can be other mode bits set.' + format: int32 + type: integer + items: + description: If unspecified, each key-value + pair in the Data field of the referenced + ConfigMap will be projected into the + volume as a file whose name is the + key and content is the value. If specified, + the listed keys will be projected + into the specified paths, and unlisted + keys will not be present. If a key + is specified which is not present + in the ConfigMap, the volume setup + will error unless it is marked optional. + Paths must be relative and may not + contain the '..' path or start with + '..'. + items: + description: Maps a string key to + a path within a volume. + properties: + key: + description: The key to project. 
+ type: string + mode: + description: 'Optional: mode bits + used to set permissions on this + file. Must be an octal value + between 0000 and 0777 or a decimal + value between 0 and 511. YAML + accepts both octal and decimal + values, JSON requires decimal + values for mode bits. If not + specified, the volume defaultMode + will be used. This might be + in conflict with other options + that affect the file mode, like + fsGroup, and the result can + be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path + of the file to map the key to. + May not be an absolute path. + May not contain the path element + '..'. May not start with the + string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its keys must be defined + type: boolean + type: object + csi: + description: CSI (Container Storage Interface) + represents ephemeral storage that is handled + by certain external CSI drivers (Beta + feature). + properties: + driver: + description: Driver is the name of the + CSI driver that handles this volume. + Consult with your admin for the correct + name as registered in the cluster. + type: string + fsType: + description: Filesystem type to mount. + Ex. "ext4", "xfs", "ntfs". If not + provided, the empty value is passed + to the associated CSI driver which + will determine the default filesystem + to apply. + type: string + nodePublishSecretRef: + description: NodePublishSecretRef is + a reference to the secret object containing + sensitive information to pass to the + CSI driver to complete the CSI NodePublishVolume + and NodeUnpublishVolume calls. This + field is optional, and may be empty + if no secret is required. If the secret + object contains more than one secret, + all secret references are passed. + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + readOnly: + description: Specifies a read-only configuration + for the volume. Defaults to false + (read/write). + type: boolean + volumeAttributes: + additionalProperties: + type: string + description: VolumeAttributes stores + driver-specific properties that are + passed to the CSI driver. Consult + your driver's documentation for supported + values. + type: object + required: + - driver + type: object + downwardAPI: + description: DownwardAPI represents downward + API about the pod that should populate + this volume + properties: + defaultMode: + description: 'Optional: mode bits to + use on created files by default. Must + be a Optional: mode bits used to set + permissions on created files by default. + Must be an octal value between 0000 + and 0777 or a decimal value between + 0 and 511. YAML accepts both octal + and decimal values, JSON requires + decimal values for mode bits. Defaults + to 0644. Directories within the path + are not affected by this setting. + This might be in conflict with other + options that affect the file mode, + like fsGroup, and the result can be + other mode bits set.' 
+ format: int32 + type: integer + items: + description: Items is a list of downward + API volume file + items: + description: DownwardAPIVolumeFile + represents information to create + the file containing the pod field + properties: + fieldRef: + description: 'Required: Selects + a field of the pod: only annotations, + labels, name and namespace are + supported.' + properties: + apiVersion: + description: Version of the + schema the FieldPath is + written in terms of, defaults + to "v1". + type: string + fieldPath: + description: Path of the field + to select in the specified + API version. + type: string + required: + - fieldPath + type: object + mode: + description: 'Optional: mode bits + used to set permissions on this + file, must be an octal value + between 0000 and 0777 or a decimal + value between 0 and 511. YAML + accepts both octal and decimal + values, JSON requires decimal + values for mode bits. If not + specified, the volume defaultMode + will be used. This might be + in conflict with other options + that affect the file mode, like + fsGroup, and the result can + be other mode bits set.' + format: int32 + type: integer + path: + description: 'Required: Path is the + relative path name of the file + to be created. Must not be absolute + or contain the ''..'' path. + Must be utf-8 encoded. The first + item of the relative path must + not start with ''..''' + type: string + resourceFieldRef: + description: 'Selects a resource + of the container: only resources + limits and requests (limits.cpu, + limits.memory, requests.cpu + and requests.memory) are currently + supported.' + properties: + containerName: + description: 'Container name: + required for volumes, optional + for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the + output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource + to select' + type: string + required: + - resource + type: object + required: + - path + type: object + type: array + type: object + emptyDir: + description: 'EmptyDir represents a temporary + directory that shares a pod''s lifetime. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + properties: + medium: + description: 'What type of storage medium + should back this directory. The default + is "" which means to use the node''s + default medium. Must be an empty string + (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: + anyOf: + - type: integer + - type: string + description: 'Total amount of local + storage required for this EmptyDir + volume. The size limit is also applicable + for memory medium. The maximum usage + on memory medium EmptyDir would be + the minimum value between the SizeLimit + specified here and the sum of memory + limits of all containers in a pod. + The default is nil which means that + the limit is undefined. More info: + http://kubernetes.io/docs/user-guide/volumes#emptydir' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + ephemeral: + description: "Ephemeral represents a volume + that is handled by a cluster storage driver. 
+ The volume's lifecycle is tied to the + pod that defines it - it will be created + before the pod starts, and deleted when + the pod is removed. \n Use this if: a) + the volume is only needed while the pod + runs, b) features of normal volumes like + restoring from snapshot or capacity tracking + are needed, c) the storage driver is specified + through a storage class, and d) the storage + driver supports dynamic volume provisioning + through a PersistentVolumeClaim (see EphemeralVolumeSource + for more information on the connection + between this volume type and PersistentVolumeClaim). + \n Use PersistentVolumeClaim or one of + the vendor-specific APIs for volumes that + persist for longer than the lifecycle + of an individual pod. \n Use CSI for light-weight + local ephemeral volumes if the CSI driver + is meant to be used that way - see the + documentation of the driver for more information. + \n A pod can use both types of ephemeral + volumes and persistent volumes at the + same time. \n This is a beta feature and + only available when the GenericEphemeralVolume + feature gate is enabled." + properties: + volumeClaimTemplate: + description: "Will be used to create + a stand-alone PVC to provision the + volume. The pod in which this EphemeralVolumeSource + is embedded will be the owner of the + PVC, i.e. the PVC will be deleted + together with the pod. The name of + the PVC will be `-` where `` is the + name from the `PodSpec.Volumes` array + entry. Pod validation will reject + the pod if the concatenated name is + not valid for a PVC (for example, + too long). \n An existing PVC with + that name that is not owned by the + pod will *not* be used for the pod + to avoid using an unrelated volume + by mistake. Starting the pod is then + blocked until the unrelated PVC is + removed. If such a pre-created PVC + is meant to be used by the pod, the + PVC has to updated with an owner reference + to the pod once the pod exists. Normally + this should not be necessary, but + it may be useful when manually reconstructing + a broken cluster. \n This field is + read-only and no changes will be made + by Kubernetes to the PVC after it + has been created. \n Required, must + not be nil." + properties: + metadata: + description: May contain labels + and annotations that will be copied + into the PVC when creating it. + No other fields are allowed and + will be rejected during validation. + type: object + spec: + description: The specification for + the PersistentVolumeClaim. The + entire content is copied unchanged + into the PVC that gets created + from this template. The same fields + as in a PersistentVolumeClaim + are also valid here. + properties: + accessModes: + description: 'AccessModes contains + the desired access modes the + volume should have. More info: + https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + dataSource: + description: 'This field can + be used to specify either: + * An existing VolumeSnapshot + object (snapshot.storage.k8s.io/VolumeSnapshot) + * An existing PVC (PersistentVolumeClaim) + If the provisioner or an external + controller can support the + specified data source, it + will create a new volume based + on the contents of the specified + data source. If the AnyVolumeDataSource + feature gate is enabled, this + field will always have the + same contents as the DataSourceRef + field.' + properties: + apiGroup: + description: APIGroup is + the group for the resource + being referenced. 
If APIGroup + is not specified, the + specified Kind must be + in the core API group. + For any other third-party + types, APIGroup is required. + type: string + kind: + description: Kind is the + type of resource being + referenced + type: string + name: + description: Name is the + name of resource being + referenced + type: string + required: + - kind + - name + type: object + dataSourceRef: + description: 'Specifies the + object from which to populate + the volume with data, if a + non-empty volume is desired. + This may be any local object + from a non-empty API group + (non core object) or a PersistentVolumeClaim + object. When this field is + specified, volume binding + will only succeed if the type + of the specified object matches + some installed volume populator + or dynamic provisioner. This + field will replace the functionality + of the DataSource field and + as such if both fields are + non-empty, they must have + the same value. For backwards + compatibility, both fields + (DataSource and DataSourceRef) + will be set to the same value + automatically if one of them + is empty and the other is + non-empty. There are two important + differences between DataSource + and DataSourceRef: * While + DataSource only allows two + specific types of objects, + DataSourceRef allows any non-core + object, as well as PersistentVolumeClaim + objects. * While DataSource + ignores disallowed values + (dropping them), DataSourceRef + preserves all values, and + generates an error if a disallowed + value is specified. (Alpha) + Using this field requires + the AnyVolumeDataSource feature + gate to be enabled.' + properties: + apiGroup: + description: APIGroup is + the group for the resource + being referenced. If APIGroup + is not specified, the + specified Kind must be + in the core API group. + For any other third-party + types, APIGroup is required. + type: string + kind: + description: Kind is the + type of resource being + referenced + type: string + name: + description: Name is the + name of resource being + referenced + type: string + required: + - kind + - name + type: object + resources: + description: 'Resources represents + the minimum resources the + volume should have. More info: + https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes + the maximum amount of + compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes + the minimum amount of + compute resources required. + If Requests is omitted + for a container, it defaults + to Limits if that is explicitly + specified, otherwise to + an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + selector: + description: A label query over + volumes to consider for binding. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. 
The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, + a key, and an operator + that relates the key + and values. + properties: + key: + description: key is + the label key that + the selector applies + to. + type: string + operator: + description: operator + represents a key's + relationship to + a set of values. + Valid operators + are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values + is an array of string + values. If the operator + is In or NotIn, + the values array + must be non-empty. + If the operator + is Exists or DoesNotExist, + the values array + must be empty. This + array is replaced + during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels + is a map of {key,value} + pairs. A single {key,value} + in the matchLabels map + is equivalent to an element + of matchExpressions, whose + key field is "key", the + operator is "In", and + the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + storageClassName: + description: 'Name of the StorageClass + required by the claim. More + info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines + what type of volume is required + by the claim. Value of Filesystem + is implied when not included + in claim spec. + type: string + volumeName: + description: VolumeName is the + binding reference to the PersistentVolume + backing this claim. + type: string + type: object + required: + - spec + type: object + type: object + fc: + description: FC represents a Fibre Channel + resource that is attached to a kubelet's + host machine and then exposed to the pod. + properties: + fsType: + description: 'Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + TODO: how do we prevent errors in + the filesystem from compromising the + machine' + type: string + lun: + description: 'Optional: FC target lun + number' + format: int32 + type: integer + readOnly: + description: 'Optional: Defaults to + false (read/write). ReadOnly here + will force the ReadOnly setting in + VolumeMounts.' + type: boolean + targetWWNs: + description: 'Optional: FC target worldwide + names (WWNs)' + items: + type: string + type: array + wwids: + description: 'Optional: FC volume world + wide identifiers (wwids) Either wwids + or combination of targetWWNs and lun + must be set, but not both simultaneously.' + items: + type: string + type: array + type: object + flexVolume: + description: FlexVolume represents a generic + volume resource that is provisioned/attached + using an exec based plugin. + properties: + driver: + description: Driver is the name of the + driver to use for this volume. + type: string + fsType: + description: Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". The default + filesystem depends on FlexVolume script. + type: string + options: + additionalProperties: + type: string + description: 'Optional: Extra command + options if any.' + type: object + readOnly: + description: 'Optional: Defaults to + false (read/write). 
ReadOnly here + will force the ReadOnly setting in + VolumeMounts.' + type: boolean + secretRef: + description: 'Optional: SecretRef is + reference to the secret object containing + sensitive information to pass to the + plugin scripts. This may be empty + if no secret object is specified. + If the secret object contains more + than one secret, all secrets are passed + to the plugin scripts.' + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + required: + - driver + type: object + flocker: + description: Flocker represents a Flocker + volume attached to a kubelet's host machine. + This depends on the Flocker control service + being running + properties: + datasetName: + description: Name of the dataset stored + as metadata -> name on the dataset + for Flocker should be considered as + deprecated + type: string + datasetUUID: + description: UUID of the dataset. This + is unique identifier of a Flocker + dataset + type: string + type: object + gcePersistentDisk: + description: 'GCEPersistentDisk represents + a GCE Disk resource that is attached to + a kubelet''s host machine and then exposed + to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + properties: + fsType: + description: 'Filesystem type of the + volume that you want to mount. Tip: + Ensure that the filesystem type is + supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + TODO: how do we prevent errors in + the filesystem from compromising the + machine' + type: string + partition: + description: 'The partition in the volume + that you want to mount. If omitted, + the default is to mount by volume + name. Examples: For volume /dev/sda1, + you specify the partition as "1". + Similarly, the volume partition for + /dev/sda is "0" (or you can leave + the property empty). More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + format: int32 + type: integer + pdName: + description: 'Unique name of the PD + resource in GCE. Used to identify + the disk in GCE. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + type: string + readOnly: + description: 'ReadOnly here will force + the ReadOnly setting in VolumeMounts. + Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + type: boolean + required: + - pdName + type: object + gitRepo: + description: 'GitRepo represents a git repository + at a particular revision. DEPRECATED: + GitRepo is deprecated. To provision a + container with a git repo, mount an EmptyDir + into an InitContainer that clones the + repo using git, then mount the EmptyDir + into the Pod''s container.' + properties: + directory: + description: Target directory name. + Must not contain or start with '..'. If + '.' is supplied, the volume directory + will be the git repository. Otherwise, + if specified, the volume will contain + the git repository in the subdirectory + with the given name. + type: string + repository: + description: Repository URL + type: string + revision: + description: Commit hash for the specified + revision. 
+ type: string + required: + - repository + type: object + glusterfs: + description: 'Glusterfs represents a Glusterfs + mount on the host that shares a pod''s + lifetime. More info: https://examples.k8s.io/volumes/glusterfs/README.md' + properties: + endpoints: + description: 'EndpointsName is the endpoint + name that details Glusterfs topology. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: string + path: + description: 'Path is the Glusterfs + volume path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: string + readOnly: + description: 'ReadOnly here will force + the Glusterfs volume to be mounted + with read-only permissions. Defaults + to false. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: boolean + required: + - endpoints + - path + type: object + hostPath: + description: 'HostPath represents a pre-existing + file or directory on the host machine + that is directly exposed to the container. + This is generally used for system agents + or other privileged things that are allowed + to see the host machine. Most containers + will NOT need this. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + --- TODO(jonesdl) We need to restrict + who can use host directory mounts and + who can/can not mount host directories + as read/write.' + properties: + path: + description: 'Path of the directory + on the host. If the path is a symlink, + it will follow the link to the real + path. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + type: string + type: + description: 'Type for HostPath Volume + Defaults to "" More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + type: string + required: + - path + type: object + iscsi: + description: 'ISCSI represents an ISCSI + Disk resource that is attached to a kubelet''s + host machine and then exposed to the pod. + More info: https://examples.k8s.io/volumes/iscsi/README.md' + properties: + chapAuthDiscovery: + description: whether support iSCSI Discovery + CHAP authentication + type: boolean + chapAuthSession: + description: whether support iSCSI Session + CHAP authentication + type: boolean + fsType: + description: 'Filesystem type of the + volume that you want to mount. Tip: + Ensure that the filesystem type is + supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi + TODO: how do we prevent errors in + the filesystem from compromising the + machine' + type: string + initiatorName: + description: Custom iSCSI Initiator + Name. If initiatorName is specified + with iscsiInterface simultaneously, + new iSCSI interface : will be created for the connection. + type: string + iqn: + description: Target iSCSI Qualified + Name. + type: string + iscsiInterface: + description: iSCSI Interface Name that + uses an iSCSI transport. Defaults + to 'default' (tcp). + type: string + lun: + description: iSCSI Target Lun number. + format: int32 + type: integer + portals: + description: iSCSI Target Portal List. + The portal is either an IP or ip_addr:port + if the port is other than default + (typically TCP ports 860 and 3260). + items: + type: string + type: array + readOnly: + description: ReadOnly here will force + the ReadOnly setting in VolumeMounts. + Defaults to false. 
+ type: boolean + secretRef: + description: CHAP Secret for iSCSI target + and initiator authentication + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + targetPortal: + description: iSCSI Target Portal. The + Portal is either an IP or ip_addr:port + if the port is other than default + (typically TCP ports 860 and 3260). + type: string + required: + - iqn + - lun + - targetPortal + type: object + name: + description: 'Volume''s name. Must be a + DNS_LABEL and unique within the pod. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + nfs: + description: 'NFS represents an NFS mount + on the host that shares a pod''s lifetime + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + properties: + path: + description: 'Path that is exported + by the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: string + readOnly: + description: 'ReadOnly here will force + the NFS export to be mounted with + read-only permissions. Defaults to + false. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: boolean + server: + description: 'Server is the hostname + or IP address of the NFS server. More + info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: string + required: + - path + - server + type: object + persistentVolumeClaim: + description: 'PersistentVolumeClaimVolumeSource + represents a reference to a PersistentVolumeClaim + in the same namespace. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + properties: + claimName: + description: 'ClaimName is the name + of a PersistentVolumeClaim in the + same namespace as the pod using this + volume. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + type: string + readOnly: + description: Will force the ReadOnly + setting in VolumeMounts. Default false. + type: boolean + required: + - claimName + type: object + photonPersistentDisk: + description: PhotonPersistentDisk represents + a PhotonController persistent disk attached + and mounted on kubelets host machine + properties: + fsType: + description: Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + type: string + pdID: + description: ID that identifies Photon + Controller persistent disk + type: string + required: + - pdID + type: object + portworxVolume: + description: PortworxVolume represents a + portworx volume attached and mounted on + kubelets host machine + properties: + fsType: + description: FSType represents the filesystem + type to mount Must be a filesystem + type supported by the host operating + system. Ex. "ext4", "xfs". Implicitly + inferred to be "ext4" if unspecified. + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly + setting in VolumeMounts. 
+ type: boolean + volumeID: + description: VolumeID uniquely identifies + a Portworx volume + type: string + required: + - volumeID + type: object + projected: + description: Items for all in one resources + secrets, configmaps, and downward API + properties: + defaultMode: + description: Mode bits used to set permissions + on created files by default. Must + be an octal value between 0000 and + 0777 or a decimal value between 0 + and 511. YAML accepts both octal and + decimal values, JSON requires decimal + values for mode bits. Directories + within the path are not affected by + this setting. This might be in conflict + with other options that affect the + file mode, like fsGroup, and the result + can be other mode bits set. + format: int32 + type: integer + sources: + description: list of volume projections + items: + description: Projection that may be + projected along with other supported + volume types + properties: + configMap: + description: information about + the configMap data to project + properties: + items: + description: If unspecified, + each key-value pair in the + Data field of the referenced + ConfigMap will be projected + into the volume as a file + whose name is the key and + content is the value. If + specified, the listed keys + will be projected into the + specified paths, and unlisted + keys will not be present. + If a key is specified which + is not present in the ConfigMap, + the volume setup will error + unless it is marked optional. + Paths must be relative and + may not contain the '..' + path or start with '..'. + items: + description: Maps a string + key to a path within a + volume. + properties: + key: + description: The key + to project. + type: string + mode: + description: 'Optional: + mode bits used to + set permissions on + this file. Must be + an octal value between + 0000 and 0777 or a + decimal value between + 0 and 511. YAML accepts + both octal and decimal + values, JSON requires + decimal values for + mode bits. If not + specified, the volume + defaultMode will be + used. This might be + in conflict with other + options that affect + the file mode, like + fsGroup, and the result + can be other mode + bits set.' + format: int32 + type: integer + path: + description: The relative + path of the file to + map the key to. May + not be an absolute + path. May not contain + the path element '..'. + May not start with + the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + optional: + description: Specify whether + the ConfigMap or its keys + must be defined + type: boolean + type: object + downwardAPI: + description: information about + the downwardAPI data to project + properties: + items: + description: Items is a list + of DownwardAPIVolume file + items: + description: DownwardAPIVolumeFile + represents information + to create the file containing + the pod field + properties: + fieldRef: + description: 'Required: + Selects a field of + the pod: only annotations, + labels, name and namespace + are supported.' + properties: + apiVersion: + description: Version + of the schema + the FieldPath + is written in + terms of, defaults + to "v1". + type: string + fieldPath: + description: Path + of the field to + select in the + specified API + version. 
+ type: string + required: + - fieldPath + type: object + mode: + description: 'Optional: + mode bits used to + set permissions on + this file, must be + an octal value between + 0000 and 0777 or a + decimal value between + 0 and 511. YAML accepts + both octal and decimal + values, JSON requires + decimal values for + mode bits. If not + specified, the volume + defaultMode will be + used. This might be + in conflict with other + options that affect + the file mode, like + fsGroup, and the result + can be other mode + bits set.' + format: int32 + type: integer + path: + description: 'Required: + Path is the relative + path name of the file + to be created. Must + not be absolute or + contain the ''..'' + path. Must be utf-8 + encoded. The first + item of the relative + path must not start + with ''..''' + type: string + resourceFieldRef: + description: 'Selects + a resource of the + container: only resources + limits and requests + (limits.cpu, limits.memory, + requests.cpu and requests.memory) + are currently supported.' + properties: + containerName: + description: 'Container + name: required + for volumes, optional + for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies + the output format + of the exposed + resources, defaults + to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: + resource to select' + type: string + required: + - resource + type: object + required: + - path + type: object + type: array + type: object + secret: + description: information about + the secret data to project + properties: + items: + description: If unspecified, + each key-value pair in the + Data field of the referenced + Secret will be projected + into the volume as a file + whose name is the key and + content is the value. If + specified, the listed keys + will be projected into the + specified paths, and unlisted + keys will not be present. + If a key is specified which + is not present in the Secret, + the volume setup will error + unless it is marked optional. + Paths must be relative and + may not contain the '..' + path or start with '..'. + items: + description: Maps a string + key to a path within a + volume. + properties: + key: + description: The key + to project. + type: string + mode: + description: 'Optional: + mode bits used to + set permissions on + this file. Must be + an octal value between + 0000 and 0777 or a + decimal value between + 0 and 511. YAML accepts + both octal and decimal + values, JSON requires + decimal values for + mode bits. If not + specified, the volume + defaultMode will be + used. This might be + in conflict with other + options that affect + the file mode, like + fsGroup, and the result + can be other mode + bits set.' + format: int32 + type: integer + path: + description: The relative + path of the file to + map the key to. May + not be an absolute + path. May not contain + the path element '..'. + May not start with + the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the + referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' 
+ type: string + optional: + description: Specify whether + the Secret or its key must + be defined + type: boolean + type: object + serviceAccountToken: + description: information about + the serviceAccountToken data + to project + properties: + audience: + description: Audience is the + intended audience of the + token. A recipient of a + token must identify itself + with an identifier specified + in the audience of the token, + and otherwise should reject + the token. The audience + defaults to the identifier + of the apiserver. + type: string + expirationSeconds: + description: ExpirationSeconds + is the requested duration + of validity of the service + account token. As the token + approaches expiration, the + kubelet volume plugin will + proactively rotate the service + account token. The kubelet + will start trying to rotate + the token if the token is + older than 80 percent of + its time to live or if the + token is older than 24 hours.Defaults + to 1 hour and must be at + least 10 minutes. + format: int64 + type: integer + path: + description: Path is the path + relative to the mount point + of the file to project the + token into. + type: string + required: + - path + type: object + type: object + type: array + type: object + quobyte: + description: Quobyte represents a Quobyte + mount on the host that shares a pod's + lifetime + properties: + group: + description: Group to map volume access + to Default is no group + type: string + readOnly: + description: ReadOnly here will force + the Quobyte volume to be mounted with + read-only permissions. Defaults to + false. + type: boolean + registry: + description: Registry represents a single + or multiple Quobyte Registry services + specified as a string as host:port + pair (multiple entries are separated + with commas) which acts as the central + registry for volumes + type: string + tenant: + description: Tenant owning the given + Quobyte volume in the Backend Used + with dynamically provisioned Quobyte + volumes, value is set by the plugin + type: string + user: + description: User to map volume access + to Defaults to serivceaccount user + type: string + volume: + description: Volume is a string that + references an already created Quobyte + volume by name. + type: string + required: + - registry + - volume + type: object + rbd: + description: 'RBD represents a Rados Block + Device mount on the host that shares a + pod''s lifetime. More info: https://examples.k8s.io/volumes/rbd/README.md' + properties: + fsType: + description: 'Filesystem type of the + volume that you want to mount. Tip: + Ensure that the filesystem type is + supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd + TODO: how do we prevent errors in + the filesystem from compromising the + machine' + type: string + image: + description: 'The rados image name. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + keyring: + description: 'Keyring is the path to + key ring for RBDUser. Default is /etc/ceph/keyring. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + monitors: + description: 'A collection of Ceph monitors. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + items: + type: string + type: array + pool: + description: 'The rados pool name. Default + is rbd. 
More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + readOnly: + description: 'ReadOnly here will force + the ReadOnly setting in VolumeMounts. + Defaults to false. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: boolean + secretRef: + description: 'SecretRef is name of the + authentication secret for RBDUser. + If provided overrides keyring. Default + is nil. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + user: + description: 'The rados user name. Default + is admin. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + required: + - image + - monitors + type: object + scaleIO: + description: ScaleIO represents a ScaleIO + persistent volume attached and mounted + on Kubernetes nodes. + properties: + fsType: + description: Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". Default is + "xfs". + type: string + gateway: + description: The host address of the + ScaleIO API Gateway. + type: string + protectionDomain: + description: The name of the ScaleIO + Protection Domain for the configured + storage. + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly + setting in VolumeMounts. + type: boolean + secretRef: + description: SecretRef references to + the secret for ScaleIO user and other + sensitive information. If this is + not provided, Login operation will + fail. + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + sslEnabled: + description: Flag to enable/disable + SSL communication with Gateway, default + false + type: boolean + storageMode: + description: Indicates whether the storage + for a volume should be ThickProvisioned + or ThinProvisioned. Default is ThinProvisioned. + type: string + storagePool: + description: The ScaleIO Storage Pool + associated with the protection domain. + type: string + system: + description: The name of the storage + system as configured in ScaleIO. + type: string + volumeName: + description: The name of a volume already + created in the ScaleIO system that + is associated with this volume source. + type: string + required: + - gateway + - secretRef + - system + type: object + secret: + description: 'Secret represents a secret + that should populate this volume. More + info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + properties: + defaultMode: + description: 'Optional: mode bits used + to set permissions on created files + by default. Must be an octal value + between 0000 and 0777 or a decimal + value between 0 and 511. YAML accepts + both octal and decimal values, JSON + requires decimal values for mode bits. + Defaults to 0644. Directories within + the path are not affected by this + setting. This might be in conflict + with other options that affect the + file mode, like fsGroup, and the result + can be other mode bits set.' 
+ format: int32 + type: integer + items: + description: If unspecified, each key-value + pair in the Data field of the referenced + Secret will be projected into the + volume as a file whose name is the + key and content is the value. If specified, + the listed keys will be projected + into the specified paths, and unlisted + keys will not be present. If a key + is specified which is not present + in the Secret, the volume setup will + error unless it is marked optional. + Paths must be relative and may not + contain the '..' path or start with + '..'. + items: + description: Maps a string key to + a path within a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode bits + used to set permissions on this + file. Must be an octal value + between 0000 and 0777 or a decimal + value between 0 and 511. YAML + accepts both octal and decimal + values, JSON requires decimal + values for mode bits. If not + specified, the volume defaultMode + will be used. This might be + in conflict with other options + that affect the file mode, like + fsGroup, and the result can + be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path + of the file to map the key to. + May not be an absolute path. + May not contain the path element + '..'. May not start with the + string '..'. + type: string + required: + - key + - path + type: object + type: array + optional: + description: Specify whether the Secret + or its keys must be defined + type: boolean + secretName: + description: 'Name of the secret in + the pod''s namespace to use. More + info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + type: string + type: object + storageos: + description: StorageOS represents a StorageOS + volume attached and mounted on Kubernetes + nodes. + properties: + fsType: + description: Filesystem type to mount. + Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly + setting in VolumeMounts. + type: boolean + secretRef: + description: SecretRef specifies the + secret to use for obtaining the StorageOS + API credentials. If not specified, + default values will be attempted. + properties: + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. + apiVersion, kind, uid?' + type: string + type: object + volumeName: + description: VolumeName is the human-readable + name of the StorageOS volume. Volume + names are only unique within a namespace. + type: string + volumeNamespace: + description: VolumeNamespace specifies + the scope of the volume within StorageOS. If + no namespace is specified then the + Pod's namespace will be used. This + allows the Kubernetes name scoping + to be mirrored within StorageOS for + tighter integration. Set VolumeName + to any name to override the default + behaviour. Set to "default" if you + are not using namespaces within StorageOS. + Namespaces that do not pre-exist within + StorageOS will be created. + type: string + type: object + vsphereVolume: + description: VsphereVolume represents a + vSphere volume attached and mounted on + kubelets host machine + properties: + fsType: + description: Filesystem type to mount. 
+ Must be a filesystem type supported + by the host operating system. Ex. + "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. + type: string + storagePolicyID: + description: Storage Policy Based Management + (SPBM) profile ID associated with + the StoragePolicyName. + type: string + storagePolicyName: + description: Storage Policy Based Management + (SPBM) profile name. + type: string + volumePath: + description: Path that identifies vSphere + volume vmdk + type: string + required: + - volumePath + type: object + required: + - name + type: object + type: array + required: + - containers + type: object + type: object + ttl: + default: '''24h''' + description: Time-to-Live period for the reservation. + `expires` and `ttl` are mutually exclusive. Defaults + to 24h. Set 0 to disable expiration. + type: string + type: object + type: object + type: object + ttl: + description: TTL controls the PodMigrationJob timeout duration. + type: string + required: + - podRef + type: object + status: + properties: + conditions: + description: Conditions records the stats of PodMigrationJob + items: + properties: + lastProbeTime: + description: Last time we probed the condition. + format: date-time + nullable: true + type: string + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. + format: date-time + nullable: true + type: string + message: + description: Human-readable message indicating details about + last transition. + type: string + reason: + description: Unique, one-word, CamelCase reason for the condition's + last transition. + type: string + status: + description: Status is the status of the condition. Can be True, + False, Unknown. + type: string + type: + description: Type is the type of the condition. + type: string + required: + - status + - type + type: object + type: array + message: + description: Message represents a human-readable message indicating + details about why the PodMigrationJob is in this state. + type: string + nodeName: + description: NodeName represents the node's name of migrated Pod + type: string + phase: + description: PodMigrationJobPhase represents the phase of a PodMigrationJob + is a simple, high-level summary of where the PodMigrationJob is + in its lifecycle. e.g. Pending/Running/Failed + type: string + podsRef: + description: PodsRef represents the newly created Pods after being + migrated + items: + description: 'ObjectReference contains enough information to let + you inspect or modify the referred object. --- New uses of this + type are discouraged because of difficulty describing its usage + when embedded in APIs. 1. Ignored fields. It includes many fields + which are not generally honored. For instance, ResourceVersion + and FieldPath are both very rarely valid in actual usage. 2. Invalid + usage help. It is impossible to add specific help for individual + usage. In most embedded usages, there are particular restrictions + like, "must refer only to types A and B" or "UID not honored" + or "name must be restricted". Those cannot be well described when + embedded. 3. Inconsistent validation. Because the usages are + different, the validation rules are different by usage, which + makes it hard for users to predict what will happen. 4. The fields + are both imprecise and overly precise. Kind is not a precise + mapping to a URL. This can produce ambiguity during interpretation + and require a REST mapping. 
In most cases, the dependency is + on the group,resource tuple and the version of the actual struct + is irrelevant. 5. We cannot easily change it. Because this type + is embedded in many locations, updates to this type will affect + numerous schemas. Don''t make new APIs embed an underspecified + API type they do not control. Instead of using this type, create + a locally provided and used type that is well-focused on your + reference. For example, ServiceReferences for admission registration: + https://github.com/kubernetes/api/blob/release-1.17/admissionregistration/v1/types.go#L533 + .' + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an object instead of + an entire object, this string should contain a valid JSON/Go + field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within + a pod, this would take on a value like: "spec.containers{name}" + (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" + (container with index 2 in this pod). This syntax is chosen + only to have some well-defined way of referencing a part of + an object. TODO: this design is not final and this field is + subject to change in the future.' + type: string + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which this reference + is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + type: array + preemptedPodsRef: + description: PreemptedPodsRef represents the Pods that be preempted + items: + description: 'ObjectReference contains enough information to let + you inspect or modify the referred object. --- New uses of this + type are discouraged because of difficulty describing its usage + when embedded in APIs. 1. Ignored fields. It includes many fields + which are not generally honored. For instance, ResourceVersion + and FieldPath are both very rarely valid in actual usage. 2. Invalid + usage help. It is impossible to add specific help for individual + usage. In most embedded usages, there are particular restrictions + like, "must refer only to types A and B" or "UID not honored" + or "name must be restricted". Those cannot be well described when + embedded. 3. Inconsistent validation. Because the usages are + different, the validation rules are different by usage, which + makes it hard for users to predict what will happen. 4. The fields + are both imprecise and overly precise. Kind is not a precise + mapping to a URL. This can produce ambiguity during interpretation + and require a REST mapping. In most cases, the dependency is + on the group,resource tuple and the version of the actual struct + is irrelevant. 
5. We cannot easily change it. Because this type + is embedded in many locations, updates to this type will affect + numerous schemas. Don''t make new APIs embed an underspecified + API type they do not control. Instead of using this type, create + a locally provided and used type that is well-focused on your + reference. For example, ServiceReferences for admission registration: + https://github.com/kubernetes/api/blob/release-1.17/admissionregistration/v1/types.go#L533 + .' + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an object instead of + an entire object, this string should contain a valid JSON/Go + field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within + a pod, this would take on a value like: "spec.containers{name}" + (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" + (container with index 2 in this pod). This syntax is chosen + only to have some well-defined way of referencing a part of + an object. TODO: this design is not final and this field is + subject to change in the future.' + type: string + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which this reference + is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + type: array + preemptedPodsReservation: + description: PreemptedPodsReservations records information about Reservations + created due to preemption + items: + properties: + name: + description: Name represents the name of Reservation + type: string + namespace: + description: Namespace represents the namespace of Reservation + type: string + nodeName: + description: NodeName represents the assigned node for Reservation + by scheduler + type: string + phase: + description: Phase represents the Phase of Reservation + type: string + podsRef: + description: PodsRef represents the newly created Pods after + being preempted + items: + description: 'ObjectReference contains enough information + to let you inspect or modify the referred object. --- New + uses of this type are discouraged because of difficulty + describing its usage when embedded in APIs. 1. Ignored fields. It + includes many fields which are not generally honored. For + instance, ResourceVersion and FieldPath are both very rarely + valid in actual usage. 2. Invalid usage help. It is impossible + to add specific help for individual usage. In most embedded + usages, there are particular restrictions like, "must refer + only to types A and B" or "UID not honored" or "name must + be restricted". Those cannot be well described when embedded. + 3. Inconsistent validation. 
Because the usages are different, + the validation rules are different by usage, which makes + it hard for users to predict what will happen. 4. The fields + are both imprecise and overly precise. Kind is not a precise + mapping to a URL. This can produce ambiguity during interpretation + and require a REST mapping. In most cases, the dependency + is on the group,resource tuple and the version of the actual + struct is irrelevant. 5. We cannot easily change it. Because + this type is embedded in many locations, updates to this + type will affect numerous schemas. Don''t make new APIs + embed an underspecified API type they do not control. Instead + of using this type, create a locally provided and used type + that is well-focused on your reference. For example, ServiceReferences + for admission registration: https://github.com/kubernetes/api/blob/release-1.17/admissionregistration/v1/types.go#L533 + .' + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an object instead + of an entire object, this string should contain a valid + JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container + within a pod, this would take on a value like: "spec.containers{name}" + (where "name" refers to the name of the container that + triggered the event) or if no container name is specified + "spec.containers[2]" (container with index 2 in this + pod). This syntax is chosen only to have some well-defined + way of referencing a part of an object. TODO: this design + is not final and this field is subject to change in + the future.' + type: string + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which this reference + is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + type: array + preemptedPodRef: + description: PreemptedPodRef represents the Pod that be preempted + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: 'If referring to a piece of an object instead + of an entire object, this string should contain a valid + JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container + within a pod, this would take on a value like: "spec.containers{name}" + (where "name" refers to the name of the container that + triggered the event) or if no container name is specified + "spec.containers[2]" (container with index 2 in this pod). + This syntax is chosen only to have some well-defined way + of referencing a part of an object. TODO: this design + is not final and this field is subject to change in the + future.' 
+ type: string + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + namespace: + description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/' + type: string + resourceVersion: + description: 'Specific resourceVersion to which this reference + is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' + type: string + uid: + description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids' + type: string + type: object + type: object + type: array + reason: + description: Reason represents a brief CamelCase message indicating + details about why the PodMigrationJob is in this state. + type: string + state: + description: Status represents the current status of PodMigrationJob + e.g. ReservationCreated + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/config/crd/bases/scheduling.koordinator.sh_reservations.yaml b/config/crd/bases/scheduling.koordinator.sh_reservations.yaml index 74bb3f575..c19fdb95f 100644 --- a/config/crd/bases/scheduling.koordinator.sh_reservations.yaml +++ b/config/crd/bases/scheduling.koordinator.sh_reservations.yaml @@ -38,7 +38,10 @@ spec: name: v1alpha1 schema: openAPIV3Schema: - description: Reservation is the Schema for the reservation API + description: Reservation is the Schema for the reservation API. A Reservation + object is namespaced. But it can reserve resources for pods of any namespace. + Any namespaced affinity/anti-affinity of reservation scheduling can be specified + with the ObjectMeta. properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation @@ -55,14 +58,15 @@ spec: spec: properties: expires: - description: Expired timestamp when the reservation expires. `expires` - and `ttl` are mutually exclusive. Defaults to being set dynamically - at runtime based on the `ttl`. + description: Expired timestamp when the reservation is expected to + expire. If both `expires` and `ttl` are set, `expires` is checked + first. `expires` and `ttl` are mutually exclusive. Defaults to being + set dynamically at runtime based on the `ttl`. format: date-time type: string owners: description: Specify the owners who can allocate the reserved resources. - Multiple owner selectors and ANDed. + Multiple owner selectors and ORed. items: properties: controller: @@ -147,7 +151,7 @@ spec: type: object type: object object: - description: Multiple field selectors are ORed. + description: Multiple field selectors are ANDed. properties: apiVersion: description: API version of the referent. @@ -6994,12 +6998,32 @@ spec: type: object type: object ttl: + default: '''24h''' description: Time-to-Live period for the reservation. `expires` and - `ttl` are mutually exclusive. Defaults to 24h. + `ttl` are mutually exclusive. Defaults to 24h. Set 0 to disable + expiration. 
type: string type: object status: properties: + allocatable: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resource reserved and allocatable for owners. + type: object + allocated: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resource allocated by current owners. + type: object conditions: description: The `conditions` indicate the messages of reason why the reservation is still pending. @@ -7086,6 +7110,14 @@ spec: type: string type: object type: array + expired: + description: The `expired` indicates the timestamp if the reservation + is expired. + format: date-time + type: string + nodeName: + description: Name of node the reservation is scheduled on. + type: string phase: description: The `phase` indicates whether is reservation is waiting for process (`Pending`), available to allocate (`Available`) or diff --git a/config/crd/bases/slo.koordinator.sh_nodemetrics.yaml b/config/crd/bases/slo.koordinator.sh_nodemetrics.yaml index 43fb648f6..9e03c42eb 100644 --- a/config/crd/bases/slo.koordinator.sh_nodemetrics.yaml +++ b/config/crd/bases/slo.koordinator.sh_nodemetrics.yaml @@ -58,6 +58,36 @@ spec: properties: nodeUsage: properties: + devices: + items: + properties: + health: + description: Health indicates whether the device is + normal + type: boolean + id: + description: UUID represents the UUID of device + type: string + minor: + description: Minor represents the Minor number of Device, + starting from 0 + format: int32 + type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resources is a set of (resource name, quantity) + pairs + type: object + type: + description: Type represents the type of device + type: string + type: object + type: array resources: additionalProperties: anyOf: @@ -81,6 +111,36 @@ spec: type: string podUsage: properties: + devices: + items: + properties: + health: + description: Health indicates whether the device is + normal + type: boolean + id: + description: UUID represents the UUID of device + type: string + minor: + description: Minor represents the Minor number of + Device, starting from 0 + format: int32 + type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resources is a set of (resource name, + quantity) pairs + type: object + type: + description: Type represents the type of device + type: string + type: object + type: array resources: additionalProperties: anyOf: diff --git a/config/crd/bases/slo.koordinator.sh_nodeslos.yaml b/config/crd/bases/slo.koordinator.sh_nodeslos.yaml index 251d6a865..e7e64ca86 100644 --- a/config/crd/bases/slo.koordinator.sh_nodeslos.yaml +++ b/config/crd/bases/slo.koordinator.sh_nodeslos.yaml @@ -67,14 +67,14 @@ spec: format: int64 type: integer type: object - resourceQoSStrategy: + resourceQOSStrategy: 
description: QoS config strategy for pods of different qos-class properties: - be: - description: ResourceQoS for BE pods. + beClass: + description: ResourceQOS for BE pods. properties: - cpuQoS: - description: CPUQoSCfg stores node-level config of cpu qos + cpuQOS: + description: CPUQOSCfg stores node-level config of cpu qos properties: enable: description: Enable indicates whether the cpu qos is enabled. @@ -85,17 +85,17 @@ spec: format: int64 type: integer type: object - memoryQoS: - description: MemoryQoSCfg stores node-level config of memory + memoryQOS: + description: MemoryQOSCfg stores node-level config of memory qos properties: enable: description: 'Enable indicates whether the memory qos is enabled (default: false). This field is used for node-level control, while pod-level configuration is - done with MemoryQoS and `Policy` instead of an `Enable` - option. Please view the differences between MemoryQoSCfg - and PodMemoryQoSConfig structs.' + done with MemoryQOS and `Policy` instead of an `Enable` + option. Please view the differences between MemoryQOSCfg + and PodMemoryQOSConfig structs.' type: boolean lowLimitPercent: description: 'LowLimitPercent specifies the lowLimitFactor @@ -186,8 +186,8 @@ spec: minimum: 1 type: integer type: object - resctrlQoS: - description: ResctrlQoSCfg stores node-level config of resctrl + resctrlQOS: + description: ResctrlQOSCfg stores node-level config of resctrl qos properties: catRangeEndPercent: @@ -218,10 +218,10 @@ spec: type: object type: object cgroupRoot: - description: ResourceQoS for root cgroup. + description: ResourceQOS for root cgroup. properties: - cpuQoS: - description: CPUQoSCfg stores node-level config of cpu qos + cpuQOS: + description: CPUQOSCfg stores node-level config of cpu qos properties: enable: description: Enable indicates whether the cpu qos is enabled. @@ -232,17 +232,17 @@ spec: format: int64 type: integer type: object - memoryQoS: - description: MemoryQoSCfg stores node-level config of memory + memoryQOS: + description: MemoryQOSCfg stores node-level config of memory qos properties: enable: description: 'Enable indicates whether the memory qos is enabled (default: false). This field is used for node-level control, while pod-level configuration is - done with MemoryQoS and `Policy` instead of an `Enable` - option. Please view the differences between MemoryQoSCfg - and PodMemoryQoSConfig structs.' + done with MemoryQOS and `Policy` instead of an `Enable` + option. Please view the differences between MemoryQOSCfg + and PodMemoryQOSConfig structs.' type: boolean lowLimitPercent: description: 'LowLimitPercent specifies the lowLimitFactor @@ -333,8 +333,8 @@ spec: minimum: 1 type: integer type: object - resctrlQoS: - description: ResctrlQoSCfg stores node-level config of resctrl + resctrlQOS: + description: ResctrlQOSCfg stores node-level config of resctrl qos properties: catRangeEndPercent: @@ -364,11 +364,11 @@ spec: type: integer type: object type: object - ls: - description: ResourceQoS for LS pods. + lsClass: + description: ResourceQOS for LS pods. properties: - cpuQoS: - description: CPUQoSCfg stores node-level config of cpu qos + cpuQOS: + description: CPUQOSCfg stores node-level config of cpu qos properties: enable: description: Enable indicates whether the cpu qos is enabled. 
@@ -379,17 +379,17 @@ spec: format: int64 type: integer type: object - memoryQoS: - description: MemoryQoSCfg stores node-level config of memory + memoryQOS: + description: MemoryQOSCfg stores node-level config of memory qos properties: enable: description: 'Enable indicates whether the memory qos is enabled (default: false). This field is used for node-level control, while pod-level configuration is - done with MemoryQoS and `Policy` instead of an `Enable` - option. Please view the differences between MemoryQoSCfg - and PodMemoryQoSConfig structs.' + done with MemoryQOS and `Policy` instead of an `Enable` + option. Please view the differences between MemoryQOSCfg + and PodMemoryQOSConfig structs.' type: boolean lowLimitPercent: description: 'LowLimitPercent specifies the lowLimitFactor @@ -480,8 +480,8 @@ spec: minimum: 1 type: integer type: object - resctrlQoS: - description: ResctrlQoSCfg stores node-level config of resctrl + resctrlQOS: + description: ResctrlQOSCfg stores node-level config of resctrl qos properties: catRangeEndPercent: @@ -511,11 +511,11 @@ spec: type: integer type: object type: object - lsr: - description: ResourceQoS for LSR pods. + lsrClass: + description: ResourceQOS for LSR pods. properties: - cpuQoS: - description: CPUQoSCfg stores node-level config of cpu qos + cpuQOS: + description: CPUQOSCfg stores node-level config of cpu qos properties: enable: description: Enable indicates whether the cpu qos is enabled. @@ -526,17 +526,17 @@ spec: format: int64 type: integer type: object - memoryQoS: - description: MemoryQoSCfg stores node-level config of memory + memoryQOS: + description: MemoryQOSCfg stores node-level config of memory qos properties: enable: description: 'Enable indicates whether the memory qos is enabled (default: false). This field is used for node-level control, while pod-level configuration is - done with MemoryQoS and `Policy` instead of an `Enable` - option. Please view the differences between MemoryQoSCfg - and PodMemoryQoSConfig structs.' + done with MemoryQOS and `Policy` instead of an `Enable` + option. Please view the differences between MemoryQOSCfg + and PodMemoryQOSConfig structs.' type: boolean lowLimitPercent: description: 'LowLimitPercent specifies the lowLimitFactor @@ -627,8 +627,8 @@ spec: minimum: 1 type: integer type: object - resctrlQoS: - description: ResctrlQoSCfg stores node-level config of resctrl + resctrlQOS: + description: ResctrlQOSCfg stores node-level config of resctrl qos properties: catRangeEndPercent: @@ -658,11 +658,11 @@ spec: type: integer type: object type: object - system: - description: ResourceQoS for system pods + systemClass: + description: ResourceQOS for system pods properties: - cpuQoS: - description: CPUQoSCfg stores node-level config of cpu qos + cpuQOS: + description: CPUQOSCfg stores node-level config of cpu qos properties: enable: description: Enable indicates whether the cpu qos is enabled. @@ -673,17 +673,17 @@ spec: format: int64 type: integer type: object - memoryQoS: - description: MemoryQoSCfg stores node-level config of memory + memoryQOS: + description: MemoryQOSCfg stores node-level config of memory qos properties: enable: description: 'Enable indicates whether the memory qos is enabled (default: false). This field is used for node-level control, while pod-level configuration is - done with MemoryQoS and `Policy` instead of an `Enable` - option. Please view the differences between MemoryQoSCfg - and PodMemoryQoSConfig structs.' 
+ done with MemoryQOS and `Policy` instead of an `Enable` + option. Please view the differences between MemoryQOSCfg + and PodMemoryQOSConfig structs.' type: boolean lowLimitPercent: description: 'LowLimitPercent specifies the lowLimitFactor @@ -774,8 +774,8 @@ spec: minimum: 1 type: integer type: object - resctrlQoS: - description: ResctrlQoSCfg stores node-level config of resctrl + resctrlQOS: + description: ResctrlQOSCfg stores node-level config of resctrl qos properties: catRangeEndPercent: diff --git a/docker/koord-manager.dockerfile b/docker/koord-manager.dockerfile new file mode 100644 index 000000000..31446f49e --- /dev/null +++ b/docker/koord-manager.dockerfile @@ -0,0 +1,18 @@ +FROM golang:1.17 as builder +WORKDIR /go/src/github.com/koordinator-sh/koordinator + +COPY go.mod go.mod +COPY go.sum go.sum + +RUN go mod download + +COPY apis/ apis/ +COPY cmd/ cmd/ +COPY pkg/ pkg/ + +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o koord-manager cmd/koord-manager/main.go + +FROM gcr.io/distroless/static:latest +WORKDIR / +COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/koord-manager . +ENTRYPOINT ["/koord-manager"] diff --git a/docker/koord-scheduler.dockerfile b/docker/koord-scheduler.dockerfile new file mode 100644 index 000000000..d25a83685 --- /dev/null +++ b/docker/koord-scheduler.dockerfile @@ -0,0 +1,18 @@ +FROM golang:1.17 as builder +WORKDIR /go/src/github.com/koordinator-sh/koordinator + +COPY go.mod go.mod +COPY go.sum go.sum + +RUN go mod download + +COPY apis/ apis/ +COPY cmd/ cmd/ +COPY pkg/ pkg/ + +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o koord-scheduler cmd/koord-scheduler/main.go + +FROM gcr.io/distroless/static:latest +WORKDIR / +COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/koord-scheduler . +ENTRYPOINT ["/koord-scheduler"] diff --git a/docker/koordlet.dockerfile b/docker/koordlet.dockerfile new file mode 100644 index 000000000..69f42702c --- /dev/null +++ b/docker/koordlet.dockerfile @@ -0,0 +1,18 @@ +FROM golang:1.17 as builder +WORKDIR /go/src/github.com/koordinator-sh/koordinator + +COPY go.mod go.mod +COPY go.sum go.sum + +RUN go mod download + +COPY apis/ apis/ +COPY cmd/ cmd/ +COPY pkg/ pkg/ + +RUN GOOS=linux GOARCH=amd64 go build -a -o koordlet cmd/koordlet/main.go + +FROM nvidia/cuda:11.6.1-base-ubuntu20.04 +WORKDIR / +COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/koordlet . 
+ENTRYPOINT ["/koordlet"] diff --git a/docs/images/nvlink.jpg b/docs/images/nvlink.jpg new file mode 100644 index 000000000..5de9db126 Binary files /dev/null and b/docs/images/nvlink.jpg differ diff --git a/docs/proposals/scheduling/20220530-fine-grained-cpu-orchestration.md b/docs/proposals/scheduling/20220530-fine-grained-cpu-orchestration.md index f687ff40d..bbca4886e 100644 --- a/docs/proposals/scheduling/20220530-fine-grained-cpu-orchestration.md +++ b/docs/proposals/scheduling/20220530-fine-grained-cpu-orchestration.md @@ -11,7 +11,7 @@ reviewers: - "@stormgbs" - "@zwzhang0107" creation-date: 2022-05-30 -last-updated: 2022-06-24 +last-updated: 2022-07-11 status: provisional --- @@ -190,8 +190,8 @@ type ResourceSpec struct { type CPUBindPolicy string const ( - // CPUBindPolicyNone does not perform any bind policy - CPUBindPolicyNone CPUBindPolicy = "None" + // CPUBindPolicyDefault performs the default bind policy that specified in koord-scheduler configuration + CPUBindPolicyDefault CPUBindPolicy = "Default" // CPUBindPolicyFullPCPUs favor cpuset allocation that pack in few physical cores CPUBindPolicyFullPCPUs CPUBindPolicy = "FullPCPUs" // CPUBindPolicySpreadByPCPUs favor cpuset allocation that evenly allocate logical cpus across physical cores @@ -203,8 +203,8 @@ const ( type CPUExclusivePolicy string const ( - // CPUExclusivePolicyNone does not perform any exclusive policy - CPUExclusivePolicyNone CPUExclusivePolicy = "None" + // CPUExclusivePolicyDefault performs the default exclusive policy that specified in koord-scheduler configuration + CPUExclusivePolicyDefault CPUExclusivePolicy = "Default" // CPUExclusivePolicyPCPULevel represents mutual exclusion in the physical core dimension CPUExclusivePolicyPCPULevel CPUExclusivePolicy = "PCPULevel" // CPUExclusivePolicyNUMANodeLevel indicates mutual exclusion in the NUMA topology dimension @@ -213,13 +213,13 @@ const ( ``` - The `CPUBindPolicy` defines the CPU binding policy. The specific values are defined as follows: - - `CPUBindPolicyNone` or empty value does not perform any bind policy. It is completely determined by the scheduler plugin configuration. + - `CPUBindPolicyDefault` or empty value performs the default bind policy that specified in koord-scheduler configuration. - `CPUBindPolicyFullPCPUs` is a bin-packing policy, similar to the `full-pcpus-only=true` option defined by the kubelet, that allocate full physical cores. However, if the number of remaining logical CPUs in the node is sufficient but the number of full physical cores is insufficient, the allocation will continue. This policy can effectively avoid the noisy neighbor problem. - `CPUBindPolicySpreadByPCPUs` is a spread policy. If the node enabled Hyper-Threading, when this policy is adopted, the scheduler will evenly allocate logical CPUs across physical cores. For example, the current node has 8 physical cores and 16 logical CPUs. When a Pod requires 8 logical CPUs and the `CPUBindPolicySpreadByPCPUs` policy is adopted, the scheduler will allocate an logical CPU from each physical core. This policy is mainly used by some latency-sensitive applications with multiple different peak-to-valley characteristics. It can not only allow the application to fully use the CPU at certain times, but will not be disturbed by the application on the same physical core. So the noisy neighbor problem may arise when using this policy. - `CPUBindPolicyConstrainedBurst` a special policy that mainly helps K8s Burstable/Koordinator LS Pod get better performance. 
When using the policy, koord-scheduler is filtering out Nodes that have NUMA Nodes with suitable CPU Shared Pool by Pod Limit. After the scheduling is successful, the scheduler will update `scheduling.koordinator.sh/resource-status` in the Pod, declaring the `CPU Shared Pool` to be bound. The koordlet binds the CPU Shared Pool of the corresponding NUMA Node according to the `CPU Shared Pool` - If `kubelet.koordinator.sh/cpu-manager-policy` in `NodeResourceTopology` has option `full-pcpus-only=true`, or `node.koordinator.sh/cpu-bind-policy` in the Node with the value `PCPUOnly`, the koord-scheduler will check whether the number of CPU requests of the Pod meets the `SMT-alignment` requirements, so as to avoid being rejected by the kubelet after scheduling. koord-scheduler will avoid such nodes if the Pod uses the `CPUBindPolicySpreadByPCPUs` policy or the number of logical CPUs mapped to the number of physical cores is not an integer. - The `CPUExclusivePolicy` defines the CPU exclusive policy, it can help users to avoid noisy neighbor problems. The specific values are defined as follows: - - `CPUExclusivePolicyNone` or empty value does not perform any isolate policy. It is completely determined by the scheduler plugin configuration. + - `CPUExclusivePolicyDefault` or empty value performs the default exclusive policy that specified in koord-scheduler configuration. - `CPUExclusivePolicyPCPULevel`. When allocating logical CPUs, try to avoid physical cores that have already been applied for by the same exclusive policy. It is a supplement to the `CPUBindPolicySpreadByPCPUs` policy. - `CPUExclusivePolicyNUMANodeLevel`. When allocating logical CPUs, try to avoid NUMA Nodes that has already been applied for by the same exclusive policy. If there is no NUMA Node that satisfies the policy, downgrade to `PCPU` policy. @@ -646,10 +646,8 @@ The following is an approximate brief algorithm logic: type CPUOrchestrationPluginArgs struct { metav1.TypeMeta - PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"` - PreferredCPUExclusivePolicy CPUExclusivePolicy `json:"preferredCPUExclusivePolicy,omitempty"` + DefaultCPUBindPolicy CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"` NUMATopologyAlignmentPolicy NUMATopologyAlignmentPolicy `json:"numaTopologyAlignmentPolicy,omitempty"` - NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"` ScoringStrategy ScoringStrategy `json:"scoringStrategy,omitempty"` } @@ -677,11 +675,9 @@ type ScoringStrategy struct { } ``` -- `CPUBindPolicy` represents the default bind policy. If not set, use `PCPUFirst` as default value. -- `CPUExclusivePolicy` represents the default exclusive policy. There is no default value. +- `DefaultCPUBindPolicy` represents the default bind policy. If not set, use `FullPCPUs` as default value. - `NUMATopologyAlignmentPolicy` represents the default NUMA topology alignment policy, If not set, use `BestEffort` as default value. - `ScoringStrategy` represents the node resource scoring strategy. If not set, use `MostAllocated` as default value. -- `NUMAAllocateStrategy` represents the default NUMA allocate strategy. If not set, use `MostAllocated` as default value. 
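For illustration, a minimal sketch of the defaulting rule described above (an empty `DefaultCPUBindPolicy` falls back to `FullPCPUs`). The helper name and the trimmed-down type stubs are assumptions for readability only and are not part of this proposal.

```go
package main

import "fmt"

// Minimal stand-ins for the types defined in this proposal, reproduced here only so the
// sketch compiles on its own.
type CPUBindPolicy string

const CPUBindPolicyFullPCPUs CPUBindPolicy = "FullPCPUs"

type CPUOrchestrationPluginArgs struct {
	DefaultCPUBindPolicy CPUBindPolicy
}

// applyDefaults is a hypothetical helper illustrating the defaulting behavior described
// above: an unset DefaultCPUBindPolicy falls back to FullPCPUs.
func applyDefaults(args *CPUOrchestrationPluginArgs) {
	if args.DefaultCPUBindPolicy == "" {
		args.DefaultCPUBindPolicy = CPUBindPolicyFullPCPUs
	}
}

func main() {
	args := &CPUOrchestrationPluginArgs{}
	applyDefaults(args)
	fmt.Println(args.DefaultCPUBindPolicy) // FullPCPUs
}
```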
## Alternatives @@ -702,3 +698,4 @@ type ScoringStrategy struct { - Add details about how to process newly created K8s Guaranteed Pod - Support Burstable Pod staticly bind CPU - 2022-06-24: Fix typo +- 2022-07-11: Adjust CPUBindPolicyNone to CPUBindPolicyDefault \ No newline at end of file diff --git a/docs/proposals/scheduling/20220609-resource-reservation.md b/docs/proposals/scheduling/20220609-resource-reservation.md index dd75ac902..3f76f5c07 100644 --- a/docs/proposals/scheduling/20220609-resource-reservation.md +++ b/docs/proposals/scheduling/20220609-resource-reservation.md @@ -8,7 +8,7 @@ reviewers: - "@jasonliu747" - "@zwzhang0107" creation-date: 2022-06-09 -last-updated: 2022-06-20 +last-updated: 2022-07-20 --- # Resource Reservation @@ -121,7 +121,7 @@ type ReservationSpec struct { // the specified node. Template *corev1.PodTemplateSpec `json:"template,omitempty"` // Specify the owners who can allocate the reserved resources. - // Multiple owner selectors and ANDed. + // Multiple owner selectors and ORed. Owners []ReservationOwner `json:"owners,omitempty"` // By default, the resources requirements of reservation (specified in `template.spec`) is filtered by whether the // node has sufficient free resources (i.e. ReservationRequest < NodeFree). @@ -130,7 +130,7 @@ type ReservationSpec struct { PreAllocation bool `json:"preAllocation,omitempty"` // Time-to-Live period for the reservation. // `expires` and `ttl` are mutually exclusive. If both `ttl` and `expires` are not specified, a very - // long TTL will be picked as default. + // long TTL will be picked as default. Set 0 to disable the expiration. TTL *metav1.Duration `json:"ttl,omitempty"` // Expired timestamp when the reservation expires. // `expires` and `ttl` are mutually exclusive. Defaults to being set dynamically at runtime based on the `ttl`. @@ -141,14 +141,22 @@ type ReservationStatus struct { // The `phase` indicates whether is reservation is waiting for process (`Pending`), available to allocate // (`Available`) or expired to get cleanup (Expired). Phase ReservationPhase `json:"phase,omitempty"` + // The `expired` indicates the timestamp if the reservation is expired. + Expired *metav1.Time `json:"expired,omitempty"` // The `conditions` indicate the messages of reason why the reservation is still pending. Conditions []ReservationCondition `json:"conditions,omitempty"` // Current resource owners which allocated the reservation resources. CurrentOwners []corev1.ObjectReference `json:"currentOwners,omitempty"` + // Name of node the reservation is scheduled on. + NodeName string `json:"nodeName,omitempty"` + // Resource reserved and allocatable for owners. + Allocatable corev1.ResourceList `json:"allocatable,omitempty"` + // Resource allocated by current owners. + Allocated corev1.ResourceList `json:"allocated,omitempty"` } type ReservationOwner struct { - // Multiple field selectors are ORed. + // Multiple field selectors are ANDed. Object *corev1.ObjectReference `json:"object,omitempty"` Controller *ReservationControllerReference `json:"controller,omitempty"` LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` @@ -212,12 +220,14 @@ Let's call the reservation is *allocatable* for a pod if: When the reservation plugin is enabled, the scheduler checks for every scheduling pod if there are allocatable reservations on a node. With a `Score` plugin implemented, the scheduler prefers pods to schedule on nodes which have more allocatable reserved resources. 
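As a rough illustration of what "more allocatable reserved resources" means here, the following sketch derives the remaining reserved resources from the `allocatable` and `allocated` status fields introduced above. The helper name is an assumption for illustration and not part of the proposal.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// remainingReserved is a hypothetical helper: the resources a reservation can still
// provide are its reserved allocatable minus what current owners have already allocated
// (both fields come from ReservationStatus).
func remainingReserved(allocatable, allocated corev1.ResourceList) corev1.ResourceList {
	remaining := corev1.ResourceList{}
	for name, total := range allocatable {
		rest := total.DeepCopy()
		if used, ok := allocated[name]; ok {
			rest.Sub(used)
		}
		remaining[name] = rest
	}
	return remaining
}

func main() {
	allocatable := corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("4")}
	allocated := corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("1")}
	remaining := remainingReserved(allocatable, allocated)
	cpu := remaining[corev1.ResourceCPU]
	fmt.Println(cpu.String()) // 3
}
```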
-When a pod is scheduled on a node with allocatable reservations, it allocates resources belonging to one of reservations. (TBD: To pick one of reservations, we might choose the first coming matched one.) +When a pod is scheduled on a node with allocatable reservations, it allocates resources belonging to one of reservations. To pick one of reservations, we choose the one which can get most reserved resources allocated (i.e. MostAllocated). And the scheduler also annotates the pod with the reservation info. ##### Expiration and Cleanup When a reservation has been created for a long time exceeding the `TTL` or `Expires`, the scheduler updates its status as `Expired`. For expired reservations, the scheduler will cleanup them with a custom garbage collection period. +When a node is deleted, the available and waiting reservations on the node should be marked as `Expired` since they are not allocatable any more. + #### Use Cases To generally reserve node resources, submit a `Reservation` and set the pod template in the field `spec.template`. Then the koord-scheduler will update this `Reservation` with the scheduling result and the resources will get reserved. @@ -262,7 +272,8 @@ Reserving resources with [`pause` pods with very low assigned priority](https:// ## Implementation History - [X] 06/09/2022: Open PR for initial draft -- [ ] 06/14/2022: Sent proposal for review +- [X] 06/14/2022: Sent proposal for review +- [ ] 07/20/2022: Update design details ## References diff --git a/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md b/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md new file mode 100644 index 000000000..45f054533 --- /dev/null +++ b/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md @@ -0,0 +1,419 @@ +--- +title: Fine-grained Device Scheduling +authors: +- "@buptcozy" +co-authors: +- "@eahydra" +reviewers: +- "@eahydra" +- "@hormes" +- "@yihuifeng" +- "@honpey" +- "@zwzhang0107" +- "@jasonliu747" +creation-date: 2022-06-29 +last-updated: 2022-07-18 +status: provisional + +--- + +# Fine-grained Device Scheduling + + + +- [Fine-grained Device Scheduling](#fine-grained-device-scheduling) + - [Summary](#summary) + - [Motivation](#motivation) + - [Goals](#goals) + - [Non-goals/Future work](#non-goalsfuture-work) + - [Proposal](#proposal) + - [API](#api) + - [Device resource dimensions](#device-resource-dimensions) + - [User apply device resources scenarios](#user-apply-device-resources-scenarios) + - [Compatible with nvidia.com/gpu](#compatible-with-nvidiacomgpu) + - [Apply whole resources of GPU or part resources of GPU](#apply-whole-resources-of-gpu-or-part-resources-of-gpu) + - [Apply koordinator.sh/gpu-core and koordinator.sh/gpu-memory-ratio separately](#apply-koordinatorshgpu-core-and-koordinatorshgpu-memory-ratio-separately) + - [Apply koordinator.sh/gpu-core and koordinator.sh/gpu-memory separately](#apply-koordinatorshgpu-core-and-koordinatorshgpu-memory-separately) + - [Apply RDMA](#apply-rdma) + - [Implementation Details](#implementation-details) + - [Scheduling](#scheduling) + - [DeviceAllocation](#deviceallocation) + - [NodeDevicePlugin](#nodedeviceplugin) + - [Device Reporter](#device-reporter) + - [koordlet and koord-runtime-proxy](#koordlet-and-koord-runtime-proxy) + - [Compatibility](#compatibility) + - [Unsolved Problems](#unsolved-problems) + - [Alternatives](#alternatives) + - [Implementation History](#implementation-history) + - [References](#references) + + + +## Summary + +This proposal provides a 
fine-grained mechanism for managing GPUs and other devices such as RDMA and FPGA, defines a set of APIs to describe device information on nodes, including GPU, RDMA, and FPGA, and a new set of resource names to flexibly support users in requesting GPU resources at a finer granularity. This mechanism is the basis for subsequent GPU scheduling capabilities such as GPU Share, GPU Overcommitment, etc.
+
+## Motivation
+
+GPU devices have very strong computing power but are expensive. How to make better use of GPU equipment, fully exploit the value of the GPU, and reduce costs is a problem that needs to be solved. In the existing GPU allocation mechanism of the K8s community, the GPU is allocated by the kubelet, and it is a whole-device allocation. This method is simple and reliable, but, similar to CPU and memory, GPU capacity can also be wasted. Therefore, some users expect to use only a portion of a GPU's resources and share the rest with other workloads to save costs. Moreover, GPUs have particularities. For example, the NVLink and oversold scenarios supported by NVIDIA GPUs mentioned below both require a central decision through the scheduler to obtain globally optimal allocation results.
+
+![image](/docs/images/nvlink.jpg)
+
+From the picture, we can see that although the node has 8 GPU instances whose model is A100/V100, the data transmission speed between GPU instances differs. When a Pod requires multiple GPU instances, we can assign the Pod the combination of GPU instances with the maximum data transfer speed. In addition, when we want the GPU instances among a group of Pods to have the maximum data transfer speed between them, the scheduler should batch-allocate the best GPU instances to these Pods and assign them to the same node.
+
+### Goals
+
+1. Define the Device CRD and the Resource API.
+1. Provide a reporter component in koordlet to report Device information and resource capacities.
+1. Provide a scheduler plugin to support users to request GPU resources at a finer granularity.
+1. Provide a new runtime hook plugin in koordlet to update the environments of containers with the GPUs allocated by the scheduler.
+
+### Non-goals/Future work
+
+1. Define flexible allocation strategies, such as implementing BinPacking or Spread according to GPU resources
+
+## Proposal
+
+### API
+
+#### Device resource dimensions
+
+Since the GPU is complicated, we will introduce GPU first. As we all know, a GPU device provides both compute capability and GPU memory capability. Generally users request GPUs like "I want 1/2/4/8 GPUs", but if the node supports a GPU-level isolation mechanism, users may request GPUs like "I want 0.5/0.25 GPU resources". Moreover, users may set different compute capability and GPU memory capability for best resource utilization, so they may request GPUs like "I want X percent of compute capability and Y percent of memory capability".
+
+We abstract GPU resources into different dimensions:
+
+- `koordinator.sh/gpu-core` represents the computing capacity of the GPU. Similar to K8s MilliCPU, we abstract the total computing power of a GPU into one hundred, and users can apply for the corresponding amount of GPU computing power according to their needs.
+- `koordinator.sh/gpu-memory` represents the memory capacity of the GPU in bytes.
+- `koordinator.sh/gpu-memory-ratio` represents the percentage of the GPU's memory.
+
+Assuming that node A has 4 GPU instances, and the total memory of each instance is 8GB, when the device reporter reports GPU capacity information to `Node.Status.Allocatable`, it no longer reports nvidia.com/gpu=4, but reports the following information:
+
+```yaml
+status:
+  capacity:
+    koordinator.sh/gpu-core: 400
+    koordinator.sh/gpu-memory: "32GB"
+    koordinator.sh/gpu-memory-ratio: 400
+  allocatable:
+    koordinator.sh/gpu-core: 400
+    koordinator.sh/gpu-memory: "32GB"
+    koordinator.sh/gpu-memory-ratio: 400
+```
+
+For the convenience of users, an independent resource name `koordinator.sh/gpu` is defined. For example, when a user wants to use half of the computing resources and memory resources of a GPU instance, the user can directly declare `koordinator.sh/gpu: 50`, and the scheduler will convert it to `koordinator.sh/gpu-core: 50, koordinator.sh/gpu-memory-ratio: 50`.
+
+For other devices like RDMA and FPGA, if the node has 1 RDMA and 1 FPGA, it will report the following information:
+
+```yaml
+status:
+  capacity:
+    koordinator.sh/rdma: 100
+    koordinator.sh/fpga: 100
+  allocatable:
+    koordinator.sh/rdma: 100
+    koordinator.sh/fpga: 100
+```
+
+Why do we need both `koordinator.sh/gpu-memory-ratio` and `koordinator.sh/gpu-memory`?
+When a user applies for 0.5/0.25 GPU, the user does not know the exact total memory bytes per GPU and only wants to use
+half or a quarter of the memory, so the user can request the GPU memory with `koordinator.sh/gpu-memory-ratio`.
+When the scheduler assigns the Pod to a concrete node, the scheduler will translate `koordinator.sh/gpu-memory-ratio` to `koordinator.sh/gpu-memory` by the formula: ***allocatedMemory = totalMemoryOf(GPU) * `koordinator.sh/gpu-memory-ratio`***, so that the GPU isolation can work.
+
+During the scheduling filter phase, the scheduler will do special processing for `koordinator.sh/gpu-memory` and `koordinator.sh/gpu-memory-ratio`. When a Pod specifies `koordinator.sh/gpu-memory-ratio`, the scheduler checks each GPU instance on each node for unallocated or remaining resources to ensure that the remaining memory on each GPU instance meets the ratio requirement.
+
+If the user knows exactly or can roughly estimate the specific memory consumption of the workload, they can apply for GPU memory through `koordinator.sh/gpu-memory`. All details can be seen below.
+
+Besides, when a dimension's value is greater than 100, it means the Pod needs multiple devices. For now, the value must be divisible by 100.
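To make the translation above concrete, here is a minimal sketch of the formula. The helper name `translateGPUMemoryRatio` is only for illustration and is not part of the proposed API.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// translateGPUMemoryRatio is a hypothetical helper illustrating the formula
// allocatedMemory = totalMemoryOf(GPU) * koordinator.sh/gpu-memory-ratio / 100,
// where a ratio of 100 corresponds to one whole GPU instance.
func translateGPUMemoryRatio(totalMemory resource.Quantity, ratio int64) resource.Quantity {
	allocated := totalMemory.Value() * ratio / 100
	return *resource.NewQuantity(allocated, resource.BinarySI)
}

func main() {
	// A GPU instance with 8Gi of memory and a Pod requesting gpu-memory-ratio: 50
	// yields a gpu-memory allocation of 4Gi.
	total := resource.MustParse("8Gi")
	allocated := translateGPUMemoryRatio(total, 50)
	fmt.Println(allocated.String()) // 4Gi
}
```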
+ +#### User apply device resources scenarios + +##### Compatible with `nvidia.com/gpu` + +```yaml +resources: + requests: + nvidia.com/gpu: "2" + cpu: "4" + memory: "8Gi" +``` + +The scheduler translates the `nvida.com/gpu: 2` to the following spec: + +```yaml +resources: + requests: + koordinator.sh/gpu-core: "200" + koordinator.sh/gpu-memory-ratio: "200" + koordinator.sh/gpu-memory: "16Gi" # assume 8G memory in bytes per GPU + cpu: "4" + memory: "8Gi" +``` + +##### Apply whole resources of GPU or part resources of GPU + +```yaml +resources: + requests: + koordinator.sh/gpu: "50" + cpu: "4" + memory: "8Gi" +``` + +The scheduler translates the `koordinator.sh/gpu: "50"` to the following spec: + +```yaml +resources: + requests: + koordinator.sh/gpu-core: "50" + koordinator.sh/gpu-memory-ratio: "50" + koordinator.sh/gpu-memory: "4Gi" # assume 8G memory in bytes for the GPU + cpu: "4" + memory: "8Gi" +``` + +##### Apply `koordinator.sh/gpu-core` and `koordinator.sh/gpu-memory-ratio` separately + +```yaml +resources: + requests: + koordinator.sh/gpu-core: "50" + koordinator.sh/gpu-memory-ratio: "60" + cpu: "4" + memory: "8Gi" +``` + +##### Apply `koordinator.sh/gpu-core` and `koordinator.sh/gpu-memory` separately + +```yaml +resources: + requests: + koordinator.sh/gpu-core: "60" + koordinator.sh/gpu-memory: "4Gi" + cpu: "4" + memory: "8Gi" +``` + +##### Apply RDMA + +```yaml +resources: + requests: + koordinator.sh/rdma: "100" + cpu: "4" + memory: "8Gi" +``` + +### Implementation Details + +#### Scheduling + +1. Abstract new data structure to describe resources and healthy status per device on the node. +2. Implements the Filter/Reserve/PreBind extenstion points. +3. Automatically recognize different kind devices. When a new device added, we don't need modify any code + +##### DeviceAllocation + +In the PreBind stage, the scheduler will update the device (including GPU) allocation results, including the device's Minor and resource allocation information, to the Pod in the form of annotations. + +```go +/* +{ + "gpu": [ + { + "minor": 0, + "resouurces": { + "koordinator.sh/gpu-core": 100, + "koordinator.sh/gpu-mem-ratio": 100, + "koordinator.sh/gpu-mem": "16Gi" + } + }, + { + "minor": 1, + "resouurces": { + "koordinator.sh/gpu-core": 100, + "koordinator.sh/gpu-mem-ratio": 100, + "koordinator.sh/gpu-mem": "16Gi" + } + } + ] +} +*/ +type DeviceAllocation struct { + Minor int32 + Resources map[string]resource.Quantity +} + +type DeviceAllocations map[DeviceType][]*DeviceAllocation +``` + +##### NodeDevicePlugin + +```go +var ( + _ framework.FilterPlugin = &NodeDevicePlugin{} + _ framework.ReservePlugin = &NodeDevicePlugin{} + _ framework.PreBindPlugin = &NodeDevicePlugin{} +) + +type NodeDevicePlugin struct { + frameworkHandler framework.Handle + deviceClient deviceClient.Interface + deviceLister devicelister.DeviceLister + nodeLister listerv1.NodeLister + nodeDeviceCache *NodeDeviceCache +} + +type NodeDeviceCache struct { + nodeDevices map[string]*nodeDevice +} + +type nodeDevice struct { + DeviceTotal map[DeviceType]*deviceResource + DeviceFree map[DeviceType]*deviceResource + DeviceUsed map[DeviceType]*deviceResource + AllocateSet map[string]*PodInfo +} + +// We use `deviceResource` to present resources per device. 
+// "0": {koordinator.sh/gpu-core:100, koordinator.sh/gpu-memory-ratio:100, koordinator.sh/gpu-memory: 16GB} +// "1": {koordinator.sh/gpu-core:100, koordinator.sh/gpu-memory-ratio:100, koordinator.sh/gpu-memory: 16GB} +type deviceResource struct { + // key is the minor of device + DeviceKeyValueMap map[int32]map[string]resource.Quantity +} + +``` + +We will register node and device event handler to maintain device account. + +In Filter, we will make-up each device request by a node(the gpu-memory example), and try compare each device free +resource and pod device request. + +In Reserve/UnReserve, we will update nodeDeviceCache's used/free resource and allocateSet. Now device selection rule just based on device minor id order. + +In PreBind, we will write DeviceAllocations to Pod's annotation. + +In init stage, we should list all Node/Device/Pods to recover device accounts. + +#### Device Reporter + +Implements a new component called `Device Reporter` in koordlet to create or update `Device` CRD instance with the resources information and healthy status per device including GPU, RDMA and FPGA, etc. This version we only support GPU. It will execution `nccl` commands to get each minor resource just like k8s-gpu-device-plugins. We will apply community health check logic. + +```go +type DeviceType string + +const ( + GPU DeviceType = "gpu" + FPGA DeviceType = "fpga" + RDMA DeviceType = "rdma" +) + +type DeviceSpec struct { + Devices []DeviceInfo `json:"devices"` +} + +type DeviceInfo struct { + // UUID represents the UUID of device + UUID string `json:"id,omitempty"` + // Minor represents the Minor number of Device, starting from 0 + Minor int32 `json:"minor,omitempty"` + // Type represents the type of device + Type DeviceType `json:"deviceType,omitempty"` + // Health indicates whether the device is normal + Health bool `json:"health,omitempty"` + // Resources represents the total capacity of various resources of the device + Resources map[string]resource.Quantity `json:"resource,omitempty"` +} + +type DeviceStatus struct { + Allocations []DeviceAllocation `json:"allocations"` +} + +type DeviceAllocation struct { + Type DeviceType `json:"type"` + Entries []DeviceAllocationItem `json:"entries"` +} + +type DeviceAllocationItem struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + UUID string `json:"uuid"` + Devices []string `json:"devices"` +} + +type Device struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec DeviceSpec `json:"spec,omitempty"` + Status DeviceStatus `json:"status,omitempty"` +} + +type DeviceList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + + Items []Device `json:"items"` +} +``` + +#### koordlet and koord-runtime-proxy + +Our target is to work compatible with origin k8s kubelet and k8s device plugins, so: +1. We still allow kubelet and device plugin to allocate concrete device, which means no matter there's a k8s device +plugin or not, our design can work well. +2. In koord-runtime-proxy, we will use Pod's `DeviceAllocation` in annotation to replace the step1's result of container's +args and envs. + +We should modify protocol between koord-runtime-proxy and koordlet to add container env: + +```go +type ContainerResourceHookRequest struct { + .... + Env map[string]string +} + +type ContainerResourceHookResponse struct { + .... + Env map[string]string +} +``` + +Then we will add a new `gpu-hook` in koordlet's runtimehooks. 
It's used in `PreStartContainer` and `PostContainerstage`.
+We will generate the new GPU env `NVIDIA_VISIBLE_DEVICES` from the Pod's GPU allocation result in the annotation (see the sketch after the Alternatives section below).
+
+When we handle hot-update processing, we can handle existing scheduled Pods that have no device allocation in the Pod's annotation. If the GPU allocation info is not in the annotation, we will find the GPU allocations from the `ContainerResourceHookRequest`'s `Env`, and we will update all GPU allocations to the Device CRD instance.
+
+The koord-runtime-proxy can see these Pods' envs. We need koord-runtime-proxy to pass these environments to koordlet, and koordlet parses the GPU-related envs to find the concrete device IDs. Then koordlet will update this information to the Device CRD instance, which helps the scheduler to recover the deviceCache.
+
+Besides, koordlet should report the GPU model to node labels the same as the device plugin does, in case koordinator is working without a device plugin.
+
+Finally, we should modify `ContainerResourceExecutor`'s `UpdateRequest` function in koord-runtime-proxy, and let the new GPU env override the old GPU env.
+
+### Compatibility
+
+As we know, GPU scheduling on the kube-scheduler side is no different from other scalar resources. The concrete
+device-level assignment is done by the kubelet and the GPU device plugin, which will generate the container's GPU env.
+
+Our design has no conflict with the above process. Our device reporter will report koordinator GPU resources for the kubelet
+to update node resources. Then we schedule device requests in our new plugin with the new device resource accounting. In the pre-bind
+stage, we will update container resources with koordinator GPU resources, so that the kubelet can check the resource limitation.
+We will also add device allocation information to the Pod's annotation. On the node side, the k8s device plugin will first patch
+the container env, but we will overwrite these envs in the runtime proxy with the allocation result in the Pod's annotation.
+
+If there is no GPU device plugin, the only difference is that in the pre-bind stage, we will erase the origin GPU resource dimension
+in the container to pass the kubelet resource limitation check. This will be a flag on the scheduler side.
+
+## Unsolved Problems
+
+## Alternatives
+
+1. Users can choose whether to use the k8s device plugin. As mentioned above, we are compatible with both cases.
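As referenced above, the following is a minimal, non-authoritative sketch of how the `gpu-hook` could derive `NVIDIA_VISIBLE_DEVICES` from the device allocation annotation written by the scheduler. The helper names and the exact annotation key are assumptions for illustration; only the allocation shape follows the `DeviceAllocation` structure shown earlier.

```go
package main

import (
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
)

// DeviceAllocation mirrors the structure described in the DeviceAllocation section above;
// only the field needed for env generation is included here.
type DeviceAllocation struct {
	Minor int32 `json:"minor"`
}

// deviceAllocations maps a device type (e.g. "gpu") to its allocations, as in the
// annotation example shown earlier.
type deviceAllocations map[string][]DeviceAllocation

// buildNvidiaVisibleDevices is a hypothetical helper: it parses the allocation JSON taken
// from the Pod annotation and joins the allocated GPU minor numbers into the
// NVIDIA_VISIBLE_DEVICES value, e.g. "0,1".
func buildNvidiaVisibleDevices(annotationValue string) (string, error) {
	var allocations deviceAllocations
	if err := json.Unmarshal([]byte(annotationValue), &allocations); err != nil {
		return "", err
	}
	var minors []string
	for _, alloc := range allocations["gpu"] {
		minors = append(minors, strconv.Itoa(int(alloc.Minor)))
	}
	return strings.Join(minors, ","), nil
}

func main() {
	// Two full GPUs (minor 0 and 1) allocated by the scheduler.
	annotation := `{"gpu":[{"minor":0},{"minor":1}]}`
	env, _ := buildNvidiaVisibleDevices(annotation)
	fmt.Println("NVIDIA_VISIBLE_DEVICES=" + env) // NVIDIA_VISIBLE_DEVICES=0,1
}
```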
+ +## Implementation History + +- 2022-06-29: Initial proposal +- 2022-07-08: Refactor proposal for review +- 2022-07-18: Fix Device CRD definition + +## References \ No newline at end of file diff --git a/docs/proposals/scheduling/20220701-pod-migration-job.md b/docs/proposals/scheduling/20220701-pod-migration-job.md new file mode 100644 index 000000000..1aee46fbe --- /dev/null +++ b/docs/proposals/scheduling/20220701-pod-migration-job.md @@ -0,0 +1,399 @@ +--- +title: Pod Migration Job +authors: + - "@eahydra" +reviewers: + - "@hormes" + - "@allwmh" + - "@jasonliu747" + - "@saintube" + - "@stormgbs" + - "@zwzhang0107" +creation-date: 2022-07-01 +last-updated: 2022-07-13 +status: provisional +--- + +# Pod Migration Job + +## Table of Contents + + +- [Pod Migration Job](#pod-migration-job) + - [Table of Contents](#table-of-contents) + - [Glossary](#glossary) + - [Summary](#summary) + - [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals/Future Work](#non-goalsfuture-work) + - [Proposal](#proposal) + - [User Stories](#user-stories) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Story 3](#story-3) + - [Basic Migration API](#basic-migration-api) + - [Pod Migration Job CRD](#pod-migration-job-crd) + - [Migration Job Spec](#migration-job-spec) + - [Migration Job Status](#migration-job-status) + - [Implementation Details/Notes/Constraints](#implementation-detailsnotesconstraints) + - [PodMigrationJob Controller](#podmigrationjob-controller) + - [Group PodMigrationJob](#group-podmigrationjob) + - [Filter PodMigrationJob](#filter-podmigrationjob) + - [Sort PodMigrationJob](#sort-podmigrationjob) + - [Execute PodMigrationJob](#execute-podmigrationjob) + - [Migration Stability mechanism](#migration-stability-mechanism) + - [Controller Configuration](#controller-configuration) + - [Alternatives](#alternatives) + - [Implementation History](#implementation-history) + + +## Glossary + +## Summary + +This proposal defines a CRD-based Pod migration API, through which the descheduler or other automatic fault recovery components can evict or delete Pods more safely. At the same time, the proposal also describes the specific implementation details of the API. + +## Motivation + +Migrating Pods is an important capability that many components (such as deschedulers) rely on, and can be used to optimize scheduling or help resolve workload runtime quality issues. We believe that pod migration is a complex process, involving steps such as auditing, resource allocation, and application startup, and is mixed with application upgrading, scaling scenarios, and resource operation and maintenance operations by cluster administrators. Therefore, how to manage the stability risk of this process to ensure that the application does not fail due to the migration of Pods is a very critical issue that must be resolved. + +Therefore, it is necessary to realize a final state-oriented migration capability based on CRD, track the status of each process in the migration, and perceive scenarios such as upgrading and scaling of the application. + +### Goals + +1. Defines a CRD-based Pod Migration Job API, through which the descheduler can evict or delete Pods more safely. +2. Describe in detail the design details behind the API. + +### Non-Goals/Future Work + +1. A new descheduler framework +2. Descheduling capability for different scenarios such as load-aware descheduling, defragemention, etc. +3. The details about Deterministic preemption that preempts other Pods for Reservation. 
+ +## Proposal + +### User Stories + +#### Story 1 + +The descheduler in the K8s community evicts pods to be rescheduled according to different strategies. However, it does not guarantee whether the evicted Pod has resources available after re-creation. If a large number of new Pods are in the Pending state when the resources in the cluster are tight, may lower the application availabilities. + +#### Story 2 + +The descheduler evicts the Pod through the Eviction API, and the Eviction API decides whether to delete the Pod according to the PDB status. However, it is unable to perceive workload upgrades, scaling and other scenarios in which Pods are deleted, which will also bring security risks. + +#### Story 3 + +The Pod migration capability itself can be provided to users as a service. Users can integrate this API in their own systems to achieve safe migration, and are no longer limited to deschedulers. + + +### Basic Migration API + +These APIs provide cluster administrators with more fine-grained migration control capabilities, which can better reduce risks. + +- `scheduling.koordinator.sh/eviction-cost` indicates the eviction cost. It can be used to set to an int32. The implicit eviction cost for pods that don't set the annotation is 0, negative values are permitted. If set the cost ith `math.MaxInt32`, it means the Pod will not be evicted. Pods with lower eviction cost are preferred to be evicted before pods with higher eviction cost. If a batch of Pods to be evicted have the same priority, they will be sorted by cost, and the Pod with the smallest cost will be evicted. Although the K8s community has [Pod Deletion Cost #2255](https://github.com/kubernetes/enhancements/issues/2255), it is not a general mechanism. To avoid conflicts with components that use `Pod Deletion Cost`, users can individually mark the eviction cost for Pods. + + +### Pod Migration Job CRD + +In order to support the above user stories, a Custom Resource Definition(CRD) named `PodMigrationJob` is proposed to ensure the migration process safely. + +#### Migration Job Spec + +```go + +// PodMigrationJob is the Schema for the PodMigrationJob API +// +k8s:openapi-gen=true +// +kubebuilder:resource:scope=Cluster +type PodMigrationJob struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + Spec PodMigrationJobSpec `json:"spec,omitempty"` + Status PodMigrationJobStatus `json:"status,omitempty"` +} + +type PodMigrationJobSpec struct { + // Paused indicates whether the PodMigrationJob should to work or not. + // Default is false + // +optional + Paused bool `json:"paused,omitempty"` + + // TTL controls the PodMigrationJob timeout duration. 
+ // +optional + TTL *metav1.Duration `json:"ttl,omitempty"` + + // Mode represents the operating mode of the Job + // Default is PodMigrationJobModeReservationFirst + // +optional + Mode PodMigrationJobMode `json:"mode,omitempty"` + + // PodRef represents the Pod that be migrated + // +required + PodRef *corev1.ObjectReference `json:"podRef"` + + // ReservationOptions defines the Reservation options for migrated Pod + // +optional + ReservationOptions *PodMigrateReservationOptions `json:"reservationOptions,omitempty"` + + // DeleteOptions defines the deleting options for the migrated Pod and preempted Pods + // +optional + DeleteOptions *metav1.DeleteOptions `json:"deleteOptions,omitempty"` +} + +type PodMigrationJobMode string + +const ( + PodMigrationJobModeReservationFirst PodMigrationJobMode = "ReservationFirst" + PodMigrationJobModeEvictionDirectly PodMigrationJobMode = "EvictDirectly" +) + +type PodMigrateReservationOptions struct { + // ReservationRef if specified, PodMigrationJob will check if the status of Reservation is available. + // ReservationRef if not specified, PodMigrationJob controller will create Reservation by Template, + // and update the ReservationRef to reference the Reservation + // +optional + ReservationRef *corev1.ObjectReference `json:"reservationRef,omitempty"` + + // Template is the object that describes the Reservation that will be created if not specified ReservationRef + // +optional + Template *ReservationTemplateSpec `json:"template,omitempty"` + + // PreemptionOption decides whether to preempt other Pods. + // The preemption is safe and reserves resources for preempted Pods. + // +optional + PreemptionOptions *PodMigrationJobPreemptionOptions `json:"preemptionOptions,omitempty"` +} + +type PodMigrationJobPreemptionOptions struct { + // Reserved object. +} +``` + +- `Paused` indicates whether the PodMigrationJob should to work or not. In some scenarios, the user does not expect the PodMigrationJob Controller to process the PodMigrationJob immediately, but rather to decide whether to execute it after completing some operations similar to auditing. +- `TimeoutInSeconds` controls the PodMigrationJob timeout duration. +- The `PodMigrationJob` support two modes defined by the field `Mode`: + - `PodMigrationJobModeReservationFirst` means that before migrating a Pod, try to reserve resources through the `Reservation` API, delete the Pod to be migrated after successfully reserved, and observe the status of the `Reservation` to ensure that the `Reservation` is consumed. + - `PodMigrationJobModeEvictionDirectly` indicates that the user clearly knows the risk of evicting the Pod and decides to evict the Pod directly. + - If `Mode` is not specified, `PodMigrationJobModeReservationFirst` is used by default +- `PodRef` represents the Pod that be migrated. The field is required. +- `ReservationOptions` defines options for how to reserve resource through `Reservation` API: + - `ReservationRef` if is specified, the referenced `Reservation` instance is used first. In some scenarios, such as defragmentation, in order to ensure the reliability of the upper-layer logic, resources may have been reserved on the target node. In this case, the specified `Reservation` can be used directly. + - `Template` describes the spec of `Reservation`. It is often not necessary to set this field. When neither `ReservationRef` nor `Template` is specified, the `PodMigrationJob controller` will construct the `ReservationSpec` reserved resources according to the Spec of the migrated Pod. 
If `Template` is set, the `ReservationTemplateSpec` and the Spec of the migrated Pod will be merged to construct the `ReservationSpec` reserved resources. + - `PreemptionOptions` decides whether to preempt other Pods if reserved resources failed. The specific details of preemption will be submitted in a separate proposal description in future work, and will not be expanded here for the time being. +- `DeleteOptions` defines the options of delete operation. Whether to delete a Pod through the `K8s Delete API` or evict a Pod through the `K8s Eviction API` depends on how the user configures the parameters of the `PodMigrationJob Controller`. Users only need to set `DeleteOptions` according to the workload in their own cluster. + +#### Migration Job Status + +```go +type PodMigrationJobStatus struct { + // PodMigrationJobPhase represents the phase of a PodMigrationJob is a simple, high-level summary of where the PodMigrationJob is in its lifecycle. + // e.g. Pending/Running/Failed + Phase PodMigrationJobPhase `json:"phase,omitempty"` + // Status represents the current status of PodMigrationJob + // e.g. ReservationCreated + Status string `json:"state,omitempty"` + // Reason represents a brief CamelCase message indicating details about why the PodMigrationJob is in this state. + Reason string `json:"reason,omitempty"` + // Message represents a human-readable message indicating details about why the PodMigrationJob is in this state. + Message string `json:"message,omitempty"` + // Conditions records the stats of PodMigrationJob + Conditions []PodMigrationJobCondition `json:"conditions,omitempty"` + // NodeName represents the node's name of migrated Pod + NodeName string `json:"nodeName,omitempty"` + // PodsRef represents the newly created Pods after being migrated + PodsRef []corev1.ObjectReference `json:"podsRef,omitempty"` + // PreemptedPodsRef represents the Pods that be preempted + PreemptedPodsRef []corev1.ObjectReference `json:"preemptedPodsRef,omitempty"` + // PreemptedPodsReservations records information about Reservations created due to preemption + PreemptedPodsReservations []PodMigrationJobPreemptedReservation `json:"preemptedPodsReservation,omitempty"` +} + +type PodMigrationJobPreemptedReservation struct { + // Namespace represents the namespace of Reservation + Namespace string `json:"namespace,omitempty"` + // Name represents the name of Reservation + Name string `json:"name,omitempty"` + // NodeName represents the assigned node for Reservation by scheduler + NodeName string `json:"nodeName,omitempty"` + // Phase represents the Phase of Reservation + Phase string `json:"phase,omitempty"` + // PreemptedPodRef represents the Pod that be preempted + PreemptedPodRef *corev1.ObjectReference `json:"preemptedPodRef,omitempty"` + // PodsRef represents the newly created Pods after being preempted + PodsRef []corev1.ObjectReference `json:"podsRef,omitempty"` +} + +type PodMigrationJobCondition struct { + // Type is the type of the condition. + Type PodMigrationJobConditionType `json:"type"` + // Status is the status of the condition. + // Can be True, False, Unknown. + Status PodMigrationJobConditionStatus `json:"status"` + // Last time we probed the condition. + // +nullable + LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"` + // Last time the condition transitioned from one status to another. + // +nullable + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` + // Unique, one-word, CamelCase reason for the condition's last transition. 
+ Reason string `json:"reason,omitempty"` + // Human-readable message indicating details about last transition. + Message string `json:"message,omitempty"` +} + +type PodMigrationJobPhase string + +const ( + // PodMigrationJobPending represents the initial status + PodMigrationJobPending PodMigrationJobPhase = "Pending" + // PodMigrationJobRunning represents the PodMigrationJob is being processed + PodMigrationJobRunning PodMigrationJobPhase = "Running" + // PodMigrationJobSucceed represents the PodMigrationJob processed successfully + PodMigrationJobSucceed PodMigrationJobPhase = "Succeed" + // PodMigrationJobFailed represents the PodMigrationJob process failed caused by Timeout, Reservation failed, etc. + PodMigrationJobFailed PodMigrationJobPhase = "Failed" + // PodMigrationJobAborted represents the user forcefully aborted the PodMigrationJob. + PodMigrationJobAborted PodMigrationJobPhase = "Aborted" +) + +type PodMigrationJobConditionType string + +// These are valid conditions of PodMigrationJob. +const ( + PodMigrationJobConditionReservationCreated PodMigrationJobConditionType = "ReservationCreated" + PodMigrationJobConditionReservationScheduled PodMigrationJobConditionType = "ReservationScheduled" + PodMigrationJobConditionWaitForConfirmPreempt PodMigrationJobConditionType = "WaitForConfirmPreempt" + PodMigrationJobConditionPreempting PodMigrationJobConditionType = "Preempting" + PodMigrationJobConditionEvicting PodMigrationJobConditionType = "Evicting" + PodMigrationJobConditionReservationWaitForBind PodMigrationJobConditionType = "WaitForBind" +) + +type PodMigrationJobConditionStatus string + +const ( + PodMigrationJobConditionStatusTrue PodMigrationJobConditionStatus = "True" + PodMigrationJobConditionStatusFalse PodMigrationJobConditionStatus = "False" + PodMigrationJobConditionStatusUnknown PodMigrationJobConditionStatus = "Unknown" +) +``` + +### Implementation Details/Notes/Constraints + +#### PodMigrationJob Controller + +The difference between `PodMigrationJobController` and general controller is that `PodMigrationJobController` will evaluate all pending PodMigrationJobs together (ie PodMigrationJob.Phase is Pending) and select a batch of PodMigrationJob and reconcile them. This selection process is called the arbitration mechanism. The reason why the arbitration mechanism is introduced is mainly to control the stability risk and control the cost of migrating Pods. The arbitration mechanism includes three stages: `Group`, `Filter` and `Sort`. + +##### Group PodMigrationJob + +Aggregate according to different workloads to facilitate the processing of subsequent processes + +- Aggregate PodMigrationJob by workload +- Aggregate PodMigrationJob by Node +- Aggregate PodMigrationJob by Namespace + +##### Filter PodMigrationJob + +- Check how many PodMigrationJob of each workload are in the Running state, and record them as ***migratingReplicas***. If the ***migratingReplicas*** reach a certain threshold, they will be excluded. The detailed algorithm of this threshold is described later. +- Check the number of ***unavailableReplicas*** of each workload, and determine whether the ***unavailableReplicas + migratingReplicas*** conform to the corresponding [PDB(Pod Disruption Budget)](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) or [PUB(Pod Unavailable Budget)](https://openkruise.io/docs/user-manuals/podunavailablebudget). If there is no PDB or PUB, use the algorithm to calculate dynamically. If not, exclude the corresponding PodMigrationJob. 
+- Check the number of Pods being migrated on the node where each target Pod is located. If it exceeds the maximum migration amount for a single node, exclude it. +- Check the number of Pods being migrated in the Namespace where each target Pod is located. If it exceeds the maximum migration amount for a single Namespace, exclude it + +The detailed algorithm of Workload Max Migrating/Unavailable Replicas: + +```go +func GetMaxMigrating(replicas int, intOrPercent *intstr.IntOrString) (int, error) { + return GetMaxUnavailable(replicas, intOrPercent) +} + +func GetMaxUnavailable(replicas int, intOrPercent *intstr.IntOrString) (int, error) { + if intOrPercent == nil { + if replicas > 10 { + s := intstr.FromString("10%") + intOrPercent = &s + } else if replicas >= 4 && replicas <= 10 { + s := intstr.FromInt(2) + intOrPercent = &s + } else { + s := intstr.FromInt(1) + intOrPercent = &s + } + } + return intstr.GetValueFromIntOrPercent(intOrPercent, replicas, true) +} +``` + +##### Sort PodMigrationJob + +- Pods with higher QoS requirements are given priority, LSE > LSR > LS > BE +- Pods with higher priority will be processed first +- The higher migration priority will be processed first +- If the Pod has already initiated a migration job in the past and it fails, sort by the number of times. The lower the number of times, the priority will be given to processing +- If the workload where the Pod is located has been descheduled for a certain number of times in the past, it is sorted according to the number of times. The lower the number of times, the priority will be processed. +- Sort by the number of replicas being migrated by the workload. The lower the number of replicas being migrated, the priority will be given to processing. + +##### Execute PodMigrationJob + +- Update PodMigrationJobStatus.Phase to Running to trigger the PodMigrationJob controller reconcile these jobs +- PodMigrationJob controller reconciles process: + - If the mode of PodMigrationJob is `EvictionDirectly`, just delete the Pod through the delete method that configured in PodMigrationJob controller. And update the phase of PodMigrationJob to Success. + - If not specified ReservationOptions.ReservationRef, create the Reservation instance by the reservation template or Pod spec to reserve resources. And updates the created Reservation instance to the ReservationOptions.ReservationRef. + - Check the status of Reservation to determine whether reserve resource successfully. + - If failed to reserve, abort the PodMigrationJob and update the phase of PodMigrationJob to Fail + - If successfully reserve, delete the Pod through the delete method that configured in PodMigrationJob controller. + - Check the Reservation status to determine whether the Reservation consumed. + - If Reservation consumed, tracks the status of Reservation and update the status to PodMigrationJob + - Update phase of PodMigrationJob to Success. + +##### Migration Stability mechanism + +- Support for disabling this capability by configuration +- Supports a simple central flow control mechanism to limit the number of migrations over a period of time. + +See the Configuration section for more details + +#### Controller Configuration + +User can configure the `PodMigrationJobControllerConfiguration` through Koordinator Descheduler ConfigMap. + +```go +type PodMigrationJobControllerConfiguration struct { + // Paused indicates whether the PodMigrationJob Controller should to work or not. 
+ Paused bool `json:"paused,omitempty"` + // DryRun means only execute the entire migration logic except create Reservation or Delete Pod + // Default is false + DryRun bool `json:"dryRun,omitempty"` + + // FlowControlQPS controls the number of arbitrations per second + FlowControlQPS string `json:"flowControlQPS,omitempty"` + // FlowControlBurst is the maximum number of tokens + FlowControlBurst int32 `json:"flowControlBurst,omitempty"` + + // MaxMigratingPerNode represents he maximum number of pods that can be migrating during migrate per node. + MaxMigratingPerNode *int32 `json:"maxMigratingPerNode,omitempty"` + + // MaxMigratingPerNamespace represents he maximum number of pods that can be migrating during migrate per namespace. + MaxMigratingPerNamespace *int32 `json:"maxMigratingPerNamespace,omitempty"` + + // MaxMigratingPerWorkload represents he maximum number of pods that can be migrating during migrate per workload. + // Value can be an absolute number (ex: 5) or a percentage of desired pods (ex: 10%). + MaxMigratingPerWorkload *intstr.IntOrString `json:"maxMigratingPerWorkload,omitempty"` + + // MaxUnavailablePerWorkload represents he maximum number of pods that can be unavailable during migrate per workload. + // The unavailable state includes NotRunning/NotReady/Migrating/Evicting + // Value can be an absolute number (ex: 5) or a percentage of desired pods (ex: 10%). + MaxUnavailablePerWorkload *intstr.IntOrString `json:"maxUnavailablePerWorkload,omitempty"` + + // EvictionPolicy represents how to delete Pod, support "Delete" and "Eviction", default value is "Eviction" + EvictionPolicy string `evictionPolicy,omitempty` + // DefaultDeleteOptions defines options when deleting migrated pods and preempted pods through the method specified by EvictionPolicy + DefaultDeleteOptions *metav1.DeleteOptions `json:"defaultDeleteOptions,omitempty"` +} +``` + +## Alternatives + +## Implementation History + +- 2022-07-01: Initial proposal +- 2022-07-11: Refactor proposal for review +- 2022-07-13: Update proposal based on review comments diff --git a/docs/proposals/scheduling/20220701-schedule-gang.md b/docs/proposals/scheduling/20220701-schedule-gang.md new file mode 100644 index 000000000..4d7b50c09 --- /dev/null +++ b/docs/proposals/scheduling/20220701-schedule-gang.md @@ -0,0 +1,435 @@ +--- +title: Gang scheduling +authors: + - "@buptcozy" +reviewers: + - "@eahydra" + - "@hormes" + - "@yihuifeng" + - "@honpey" + - "@zwzhang0107" + - "@jasonliu747" +creation-date: 2022-07-01 +last-updated: 2022-07-01 + +--- + +# Gang scheduling + +## Table of Contents + + + +* [Gang scheduling](#Gang-scheduling) + * [Table of Contents](#table-of-contents) + * [Summary](#summary) + * [Motivation](#motivation) + * [Compared with competitors](#Compared-with-competitors) + * [Coescheduling](#Coescheduling) + * [Goals](#goals) + * [Non Goals and Future Work](#Non-Goals-and-Future-Work) + * [Proposal](#Proposal) + * [Key concept](#key-concept) + * [Strict-mode and non-strict-mode](#strict-mode-and-non-strict-mode) + * [GangGroup](#ganggroup) + * [After gang](#after-gang) + * [API](#API) + * [Definition](#definition) + * [CRD way](#crd-way) + * [Example](#example) + * [Annotation way](#annotation-way) + * [Example](#example) + * [Implementation Details](#Implementation-Details) + * [QueueSortPlugin](#QueueSortPlugin) + * [Data-Structure](#data-structure) + * [GangPlugin](#gang-plugin) + * [Unsolved Problems](#Unsolved-Problems) + * [Alternatives](#Alternatives) + * [Implementation History](#Implementation-History) 
* [References](#References)
+
+
+## Summary
+This proposal provides a Gang mechanism for the scheduler to control the binding opportunity of pods. Users can declare a minimum number of resources to collect;
+only when the assigned resources reach the given threshold is the binding triggered. We provide `strict-mode` and `non-strict-mode` to
+control the resource-accumulation process through configuration. We also provide a two-level Gang description to better match
+real scenarios, which is different from the community implementation.
+
+## Motivation
+In AI scenarios, lots of jobs need Gang scheduling. The community has many related implementations such as `coescheduling` or `volcano`.
+We received lots of inspiration from them during the design process.
+
+### Compared with competitors
+
+#### Coescheduling
+1. `coescheduling` implements a new queue-sort interface and other methods to let one Gang's pods get out of the queue in order as much as possible.
+If a pod fails to be scheduled, the requests that have been successfully scheduled in this round of the Gang scheduling cycle will be rolled back,
+and the remaining pods waiting for scheduling will be rejected in the PreFilter check until this scheduling cycle has passed.
+For example, if a Gang requires 10 tasks to be scheduled, and the first 5 tasks are allocated but the 6th task fails to be scheduled,
+`coescheduling` will roll back the first 5 tasks and ignore the remaining 4 tasks in this Gang scheduling cycle. `coescheduling` simply uses a
+global time interval to control the Gang scheduling cycle. The first defect is that a uniform time interval will cause
+some problems. If the time configuration is too long, it will lead to useless waiting; if the time configuration is too short,
+it will lead to useless scheduling. Secondly, it is very difficult for a large job to meet all resource requests at one time.
+This mechanism leads to a very low probability of acquiring full resources, and eventually makes the job starve to death. We call this process `strict-mode`.
+
+2. Some jobs have complex Gang requirements. For example, a job has several roles. Each role will have several pods
+and its own Gang conditions. Jobs also need different roles to form different GangGroups. All pods in a GangGroup can
+trigger the bind process only after all roles in the GangGroup meet their Gang conditions. `coescheduling` can't meet
+this requirement.
+
+### Goals
+1. Define an API to announce the Gang scheduling configuration.
+
+2. Provide a scheduler plugin to achieve the Gang scheduling ability.
+
+### Non Goals and Future Work
+1. Provide the ability to solve Gang resource deadlock problems with `non-strict-mode`.
+
+## Proposal
+
+### Key concept
+
+#### strict-mode and non-strict-mode
+
+As mentioned above, in `strict-mode`, if a pod fails to be scheduled, the pods that have been successfully scheduled in
+this scheduling cycle will be rolled back, and the remaining pods waiting for scheduling will be rejected in the
+PreFilter check until this scheduling cycle has passed. We call this mode `strict-mode`.
+
+In `non-strict-mode`, if a pod fails to be scheduled, it has no impact on any other pod. We will continue to accumulate
+the allocated pods until the condition of the Gang is met. This process is friendly to Gangs with a large number of pods, but it
+will increase the risk of resource deadlock between Gangs. For example, the quota of the quota group is 10 (quota will be proposed later),
+and the user submits three Gangs with 5 pods.
Due to various plugin constraints, Gang1/2/3 may allocate resources of 3/3/4 respectively.
+Since the quota group's quota is full, there will be no new resource scheduling. We call this the resource deadlock of Gangs.
+In a future proposal, we will try to fix this problem.
+
+#### GangGroup
+As mentioned above, some jobs have complex Gang requirements. For example, a job has several roles. Each role will have several pods
+and its own Gang conditions. Jobs also need different roles to form different GangGroups. All pods in a GangGroup can
+trigger the bind process only after all roles in the GangGroup meet their Gang conditions. So we introduce the `GangGroup` concept,
+which allows users to bundle different Gangs together.
+
+#### After Gang
+It should be noted that, if the resource accumulation conditions of a Gang have been met, but then some pods fail in the process of binding,
+or some bound pods are preempted or rescheduled, should the constraints of the Gang still be effective in the process of resource reallocation?
+Because the initial purpose of Gang is to require pods to be pulled up at the same time, if some pods have already been pulled up,
+the subsequent Gang behavior is meaningless. Therefore, once a Gang has been satisfied, all subsequent resource allocations
+are no longer constrained by Gang rules, and these pods behave like ordinary pods.
+
+As mentioned above, `WaitTime` is the max wait time since the first pod comes to the Permit stage. If `WaitTime` times out,
+the scheduler will roll back all assumed pods, update each pod's annotation with `gang.scheduling.koordinator.sh/timeout=true`, and
+won't schedule these pods anymore. Users should pay attention to this status and delete the pods timely.
+
+### API
+#### Definition
+
+Our original intention is to improve and enhance the ability of the community's original `PodGroup`, so we will be
+compatible with the way the community declares the `PodGroup`. We also provide a lightweight way to use the Gang feature
+with annotations only.
+
+#### CRD way
+Users can use the community `PodGroup` CRD to declare a gang:
+```go
+type PodGroup struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+	Spec   PodGroupSpec   `json:"spec,omitempty"`
+	Status PodGroupStatus `json:"status,omitempty"`
+}
+type PodGroupSpec struct {
+	MinMember    int32            `json:"minMember,omitempty"`
+	MinResources *v1.ResourceList `json:"minResources,omitempty"`
+
+	ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
+}
+```
+Pods should carry the `pod-group.scheduling.sigs.k8s.io` label to associate with a `PodGroup`.
+
+Also, we introduce some optional definitions as below:
+```yaml
+gang.scheduling.koordinator.sh/total-number
+gang.scheduling.koordinator.sh/gang-mode
+gang.scheduling.koordinator.sh/groups
+```
+
+- `gang.scheduling.koordinator.sh/total-number` helps to calculate the Gang scheduling cycle in `strict-mode`; you can
+find more detail in the `Data-Structure` chapter. The default equals `gang.scheduling.koordinator.sh/min-available`.
+
+- `gang.scheduling.koordinator.sh/gang-mode` determines `strict-mode` or `non-strict-mode`. The default is `strict-mode`.
+
+- `gang.scheduling.koordinator.sh/groups` describes GangGroups. The default is empty, which means the Gang doesn't need to form a `GangGroup` with others.
+
+`gang.scheduling.koordinator.sh/total-number`, `gang.scheduling.koordinator.sh/gang-mode` and `gang.scheduling.koordinator.sh/groups` should be put in
+the `PodGroup`'s annotations if needed.
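+
+To make the defaulting rules above concrete, here is a minimal sketch of how a plugin could resolve these optional annotations.
+It is illustrative only: the `resolveGangDefaults` helper is not part of this proposal's API.
+
+```go
+import (
+	"encoding/json"
+	"strconv"
+)
+
+// resolveGangDefaults resolves the optional gang annotations, falling back to the
+// defaults described above. minMember is the PodGroup's spec.minMember. Sketch only.
+func resolveGangDefaults(annotations map[string]string, minMember int32) (totalNumber int32, mode string, groups []string, err error) {
+	// total-number defaults to min-available.
+	totalNumber = minMember
+	if v, ok := annotations["gang.scheduling.koordinator.sh/total-number"]; ok {
+		n, perr := strconv.ParseInt(v, 10, 32)
+		if perr != nil {
+			return 0, "", nil, perr
+		}
+		totalNumber = int32(n)
+	}
+
+	// gang-mode defaults to strict-mode.
+	mode = "strict-mode"
+	if v, ok := annotations["gang.scheduling.koordinator.sh/gang-mode"]; ok {
+		mode = v
+	}
+
+	// groups defaults to empty: the Gang does not join a GangGroup with others.
+	if v, ok := annotations["gang.scheduling.koordinator.sh/groups"]; ok && v != "" {
+		if perr := json.Unmarshal([]byte(v), &groups); perr != nil {
+			return 0, "", nil, perr
+		}
+	}
+	return totalNumber, mode, groups, nil
+}
+```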
+
+##### Example
+When a user applies a basic gang, the example is as follows:
+```yaml
+apiVersion: v1alpha1
+kind: PodGroup
+metadata:
+  creationTimestamp: "2022-07-11T18:26:33Z"
+  name: gangA
+  namespace: default
+spec:
+  minMember: 5
+  minResources:
+    cpu: "5"
+    memory: "2048Mi"
+  scheduleTimeoutSeconds: 600
+```
+
+Let's assume a job has two roles: A and B, and each role has several pods. podA belongs to roleA, podB belongs to roleB.
+roleA and roleB belong to one GangGroup, the example is as follows:
+```yaml
+apiVersion: v1alpha1
+kind: PodGroup
+metadata:
+  creationTimestamp: "2022-07-11T18:26:33Z"
+  name: gangA
+  namespace: default
+  annotations:
+    gang.scheduling.koordinator.sh/total-number: 5
+    gang.scheduling.koordinator.sh/gang-mode: strict-mode
+    gang.scheduling.koordinator.sh/groups: ["gangA", "gangB"]
+spec:
+  minMember: 5
+  minResources:
+    cpu: "5"
+    memory: "2048Mi"
+  scheduleTimeoutSeconds: 600
+```
+
+It should be noted that, if the Gang feature is used in the `CRD way`, users should let a higher-level operator maintain the Gang CRD lifecycle,
+e.g. handling `create/update/delete` events. Also, from the scheduler's perspective, it should handle the receive-order issue
+between the Gang CRD and the pods. For example, if pods arrive at the scheduler before the Gang CRD, we have to build a fake Gang data structure
+temporarily to collect all related pods, and suspend the scheduling of these pods until the configuration is parsed from the real Gang CRD.
+
+#### Annotation way
+```yaml
+gang.scheduling.koordinator.sh/name
+gang.scheduling.koordinator.sh/min-available
+```
+
+The above definitions are indispensable. We are compatible with `pod-group.scheduling.sigs.k8s.io`, `pod-group.scheduling.sigs.k8s.io/name`
+and `pod-group.scheduling.sigs.k8s.io/min-available` in the community. We also support the new definitions above to declare the Gang's name and minimum available number.
+
+Also, we introduce some optional definitions as below; most are mentioned above:
+```yaml
+gang.scheduling.koordinator.sh/waiting-time
+gang.scheduling.koordinator.sh/total-number
+gang.scheduling.koordinator.sh/gang-mode
+gang.scheduling.koordinator.sh/groups
+```
+
+- `gang.scheduling.koordinator.sh/waiting-time` represents the max wait time since the first pod comes to the Permit stage. The default is a global config.
+
+- `gang.scheduling.koordinator.sh/total-number` helps to calculate the Gang scheduling cycle in `strict-mode`; you can
+find more detail in the `Data-Structure` chapter. The default equals `gang.scheduling.koordinator.sh/min-available`.
+
+- `gang.scheduling.koordinator.sh/gang-mode` determines `strict-mode` or `non-strict-mode`. The default is `strict-mode`.
+
+- `gang.scheduling.koordinator.sh/groups` describes GangGroups. The default is empty, which means the Gang doesn't need to form a `GangGroup` with others.
+
+It should be noted that the annotation parameters will overwrite the CRD's parameters if both exist.
+
+##### Example
+When a user applies a basic gang, the example is as follows:
+```yaml
+metadata:
+  annotations:
+    gang.scheduling.koordinator.sh/name: gangA
+    gang.scheduling.koordinator.sh/min-available: 5
+```
+
+Let's assume a job has two roles: A and B, and each role has several pods. podA belongs to roleA, podB belongs to roleB.
+roleA and roleB belong to one GangGroup, the example is as follows:
+```yaml
+metadata:
+  annotations:
+    gang.scheduling.koordinator.sh/name: gangA
+    gang.scheduling.koordinator.sh/waiting-time: 3600s
+    gang.scheduling.koordinator.sh/min-available: 5
+    gang.scheduling.koordinator.sh/total-number: 5
+    gang.scheduling.koordinator.sh/gang-mode: strict-mode
+    gang.scheduling.koordinator.sh/groups: ["gangA", "gangB"]
+---
+metadata:
+  annotations:
+    gang.scheduling.koordinator.sh/name: gangB
+    gang.scheduling.koordinator.sh/waiting-time: 3600s
+    gang.scheduling.koordinator.sh/min-available: 5
+    gang.scheduling.koordinator.sh/total-number: 5
+    gang.scheduling.koordinator.sh/gang-mode: strict-mode
+    gang.scheduling.koordinator.sh/groups: ["gangA", "gangB"]
+```
+
+Assuming a job has two roles: A and B, and each role has several pods. podA belongs to roleA, podB belongs to roleB.
+roleA and roleB belong to different GangGroups, the example is as follows:
+```yaml
+metadata:
+  annotations:
+    gang.scheduling.koordinator.sh/name: gangA
+    gang.scheduling.koordinator.sh/waiting-time: 3600s
+    gang.scheduling.koordinator.sh/min-available: 5
+    gang.scheduling.koordinator.sh/total-number: 5
+    gang.scheduling.koordinator.sh/gang-mode: strict-mode
+    gang.scheduling.koordinator.sh/groups: ""
+---
+metadata:
+  annotations:
+    gang.scheduling.koordinator.sh/name: gangB
+    gang.scheduling.koordinator.sh/waiting-time: 3600s
+    gang.scheduling.koordinator.sh/min-available: 5
+    gang.scheduling.koordinator.sh/total-number: 5
+    gang.scheduling.koordinator.sh/gang-mode: strict-mode
+    gang.scheduling.koordinator.sh/groups: ""
+```
+
+### Implementation Details
+#### QueueSortPlugin
+
+We design an independent plugin to implement the `QueueSort` extension point separately, so that we can integrate
+the queue-sort logic of all plugins and register them at one time.
+
+In this proposal, we implement the Less function to gather pods that belong to the same Gang. The specific queuing rules are:
+
+1. Firstly, compare the priorities of the two pods; the one with the higher priority goes to the front of the queue.
+
+2. Secondly, compare the creationTimestamp of the two pods; if a pod belongs to a Gang, we compare the creationTimestamp of the Gang instead.
+The one created first goes to the front of the queue.
+
+3. Finally, compare the pods' namespaces; if a pod belongs to a Gang, we compare the Gang name instead.
+
+```go
+type QueueSortPlugin interface {
+	QueueSort(*QueuedPodInfo, *QueuedPodInfo) bool
+}
+```
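+
+To make these rules concrete, here is a minimal sketch of the comparison logic. It is illustrative only: the `GangQueueSort`
+type, the `gangLookup` helper and the method name `Less` are not part of this proposal's API, and the sketch assumes a cache
+that can resolve a pod to its Gang.
+
+```go
+import (
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/kubernetes/pkg/scheduler/framework"
+)
+
+// gangLookup resolves a pod to its Gang's creation time and a stable group key.
+// For pods without a Gang it falls back to the pod's own creationTimestamp and
+// "<namespace>/<pod name>". Illustrative only.
+type gangLookup interface {
+	CreationTime(pod *corev1.Pod) time.Time
+	GroupKey(pod *corev1.Pod) string
+}
+
+type GangQueueSort struct {
+	gangs gangLookup
+}
+
+func podPriority(pod *corev1.Pod) int32 {
+	if pod.Spec.Priority != nil {
+		return *pod.Spec.Priority
+	}
+	return 0
+}
+
+// Less implements the three rules above: priority first, then the creation time of
+// the Gang (or pod), then namespace/Gang name so that one Gang's pods dequeue together.
+func (s *GangQueueSort) Less(a, b *framework.QueuedPodInfo) bool {
+	if p1, p2 := podPriority(a.Pod), podPriority(b.Pod); p1 != p2 {
+		return p1 > p2
+	}
+	t1, t2 := s.gangs.CreationTime(a.Pod), s.gangs.CreationTime(b.Pod)
+	if !t1.Equal(t2) {
+		return t1.Before(t2)
+	}
+	return s.gangs.GroupKey(a.Pod) < s.gangs.GroupKey(b.Pod)
+}
+```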
+
+#### GangSchedulingPlugin
+##### Data-Structure
+###### Gang
+```go
+type Gang struct {
+	Name                     string
+	WaitTime                 time.Duration
+	Mode                     string // strict-mode or non-strict-mode
+	GangGroup                []string
+	MinRequiredNumber        int
+	TotalChildrenNum         int
+	Children                 map[string]*PodInfo
+	BoundChildren            map[string]*PodInfo
+	WaitingForBindChildren   map[string]*PodInfo
+	ResourceSatisfied        bool
+	ScheduleCycle            int
+	ScheduleCycleValid       bool
+	ChildrenScheduleRoundMap map[string]int
+}
+```
+
+We design the Gang to record the Gang status in scheduler memory. We can get the children pods from the `Children` field, and
+`BoundChildren`/`WaitingForBindChildren` store the pods' binding status, which is used to check whether the pods can pass the Permit stage.
+
+Once the Permit stage has passed, we set `ResourceSatisfied=true`. As mentioned above in the `After Gang` chapter, this variable is
+used to judge whether the Gang has been satisfied. When handling the failover case, if any pod in the Gang has already been bound, we also set `ResourceSatisfied=true`.
+
+We especially explain the `scheduleCycle` and `childrenScheduleRoundMap` fields. These fields control the Gang's scheduling cycle. For example,
+at the beginning, `scheduleCycle` is 1 and each pod's cycle in `childrenScheduleRoundMap` is 0. When a pod comes to PreFilter,
+we check whether the pod's value in `childrenScheduleRoundMap` is smaller than the Gang's `scheduleCycle`. If it is,
+we set the pod's cycle in `childrenScheduleRoundMap` equal to `scheduleCycle` and pass the check. If it is not, it means
+the pod has already been scheduled in this cycle, so we should reject it. With the help of `totalChildrenNum`, when the last pod comes and makes all
+values in `childrenScheduleRoundMap` equal to `scheduleCycle`, the Gang's `scheduleCycle` is increased by 1, which starts a new scheduling cycle.
+
+We continue with the `scheduleCycleValid` field. During scheduling, when a pod fails at the Filter stage, we set `scheduleCycleValid` to
+false in the PostFilter stage, which means no pod in this Gang should be scheduled until it is set back to true,
+and the remaining pods should be rejected in the PreFilter stage. Only when `scheduleCycle` is increased by 1 do we reset `scheduleCycleValid` to true.
+
+It should be emphasized that `scheduleCycle`/`scheduleCycleValid`/`childrenScheduleRoundMap` only work in `strict-mode`.
+
+##### GangPlugin
+
+This is the framework of the plugin; we cache the Gang info above in the gangCache.
+```go
+type GangPlugin struct {
+	frameworkHandler     framework.Handle
+	gangClient           gangClient.Interface
+	podLister            listerv1.PodLister
+	snapshotSharedLister framework.SharedLister
+	gangCache            map[string]*Gang
+}
+```
+During the whole Kubernetes scheduling process, we only need to realize our logic in the four extension points below:
+```go
+var (
+	_ framework.PreFilterPlugin  = &GangScheduling{}
+	_ framework.PostFilterPlugin = &GangScheduling{}
+	_ framework.PermitPlugin     = &GangScheduling{}
+	_ framework.ReservePlugin    = &GangScheduling{}
+)
+
+type GangScheduling interface {
+	ActiveGang(pod *corev1.Pod, state *framework.CycleState)
+	PreFilter(context.Context, *corev1.Pod) error
+	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)
+	Permit(context.Context, *corev1.Pod) Status
+	Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string)
+}
+```
+###### **PreFilter**
+
+If the Gang is in `non-strict-mode`, we only do the first two steps below:
+
+- Check whether the children of the Gang have met the minimum-number requirement of each Gang, and reject the pod if not.
+
+- Check whether the Gang has timed out (check the pod's annotation, introduced later in the Permit section), and reject the pod if it has.
+
+- Check whether the Gang passes the `scheduleCycleValid` check, and reject the pod if not.
+
+- Try to update `scheduleCycle`, `scheduleCycleValid` and `childrenScheduleRoundMap` as mentioned above.
+
+
+###### **PostFilter**
+
+Reaching this point means the pod didn't pass the Filter plugins, so we should:
+
+- In `strict-mode`, set `scheduleCycleValid` to false and release all assumed pods.
+
+- In `non-strict-mode`, do nothing.
+
+###### **Permit**
+
+Any pod that passes the Filter stage will come to this stage. The scheduler checks, for all Gangs in the GangGroup, whether the current
+number of assumed pods in each Gang meets the Gang's minimum requirement.
+
+- If the Gangs don't meet the bind condition, we give the pod a "Wait" status with a timeout duration, and the bind
+goroutine will keep waiting until the wait times out or passes.
Then we will run the `ActiveGang` method; it puts all
+the pods of the Gang that are in the `schedulableQueue` or `backoffQueue` back into the `activeQueue`, so that the pods of the Gang
+can keep being scheduled as much as possible.
+
+It should be noted that, in the community, the scheduler limits the maximum timeout value to 15 minutes; we may need to hook `RunPermitPlugins`
+to enlarge the timeout when 15 minutes is not enough. For now we record this as a known issue.
+
+- If the Gangs meet the bind condition, we give every waiting pod a "Success" status, which lets the bind goroutine of
+each pod leave the waiting state and continue to run. Also, as mentioned above, we set the Gang's `ResourceSatisfied` to true.
+
+###### **Un-reserve**
+
+Both a Permit-stage timeout and a binding failure lead the pod to the Un-reserve stage. We can distinguish them by the Gang's `ResourceSatisfied` field:
+if the field is true, binding failed; otherwise the Gang timed out.
+
+- When the Permit stage times out, we add the annotation `gang.scheduling.koordinator.sh/timeout=true` to all the pods
+belonging to the Gang and release the resources of all the assumed pods. The Gang will not be scheduled anymore;
+users should handle the timeout event manually.
+
+- When binding fails, as mentioned above, the Gang's resource collection is already over, so we do nothing except roll back
+the failed pod's resources.
+
+###### **Init**
+
+We will register a pod event handler to watch pod events and keep the Gang updated.
+
+## Unsolved Problems
+
+## Alternatives
+Users can choose between `strict-mode` and `non-strict-mode` Gangs case by case.
+
+## Implementation History
+
+## References
\ No newline at end of file
diff --git a/docs/proposals/scheduling/20220701-simple-descheduler-framework.md b/docs/proposals/scheduling/20220701-simple-descheduler-framework.md
new file mode 100644
index 000000000..e0411a11f
--- /dev/null
+++ b/docs/proposals/scheduling/20220701-simple-descheduler-framework.md
@@ -0,0 +1,102 @@
+---
+title: Simple Descheduler Framework
+authors:
+  - "@eahydra"
+reviewers:
+  - "@hormes"
+  - "@allwmh"
+  - "@jasonliu747"
+  - "@saintube"
+  - "@zwzhang0107"
+creation-date: 2022-07-01
+last-updated: 2022-07-15
+status: provisional
+---
+
+# Simple Descheduler Framework
+
+## Table of Contents
+
+- [Simple Descheduler Framework](#simple-descheduler-framework)
+  - [Table of Contents](#table-of-contents)
+  - [Glossary](#glossary)
+  - [Summary](#summary)
+  - [Motivation](#motivation)
+    - [Goals](#goals)
+    - [Non-Goals/Future Work](#non-goalsfuture-work)
+  - [Proposal](#proposal)
+    - [Implementation Details/Notes/Constraints](#implementation-detailsnotesconstraints)
+      - [Descheduler profile](#descheduler-profile)
+      - [Abstract PodEvictor interface](#abstract-podevictor-interface)
+      - [Plug-in descheduler strategy](#plug-in-descheduler-strategy)
+    - [Alternatives](#alternatives)
+    - [Implementation History](#implementation-history)
+
+
+## Glossary
+
+## Summary
+
+This proposal designs and implements the descheduler framework required by Koordinator, based on the K8s community's [descheduler](https://github.com/kubernetes-sigs/descheduler).
+
+## Motivation
+
+The existing [descheduler](https://github.com/kubernetes-sigs/descheduler) in the community can solve some problems, but we think that there are still many aspects of it that can be improved. For example, it only supports periodic execution and does not support an event-triggered mode.
Unlike kube-scheduler, it is not possible to extend and configure custom descheduling strategies without invading the existing descheduler code; nor does it support implementing a custom evictor.
+
+We also noticed that the K8s descheduler community has found these problems and proposed corresponding solutions, such as [#753 Descheduler framework Proposal](https://github.com/kubernetes-sigs/descheduler/issues/753) and [PoC #781](https://github.com/kubernetes-sigs/descheduler/pull/781). The K8s descheduler community is trying to implement a descheduler framework similar to the K8s scheduling framework. This coincides with our thinking.
+
+On the whole, these solutions would solve most of our problems, but we also noticed that the related implementations have not been merged into the main branch. We have reviewed these implementations and discussions, and we believe this is the right direction. Considering that Koordinator has clear milestones for descheduler-related features, we will implement Koordinator's own descheduler independently of the upstream community. We will try to use some of the designs in [#753](https://github.com/kubernetes-sigs/descheduler/issues/753) proposed by the community, and we will follow Koordinator's compatibility principle with K8s to stay compatible with the upstream community descheduler in the implementation. Such an independent implementation can also drive the evolution of the upstream community's work on the descheduler framework. And when the upstream community makes new changes or switches to an architecture that Koordinator deems appropriate, Koordinator will follow up promptly and actively.
+
+### Goals
+
+1. Implement Koordinator Descheduler following part of the design in [#753](https://github.com/kubernetes-sigs/descheduler/issues/753) proposed by the community.
+
+### Non-Goals/Future Work
+
+1. Break any existing use cases of the Descheduler.
+
+## Proposal
+
+### Implementation Details/Notes/Constraints
+
+#### Descheduler profile
+
+The current descheduler configuration is too simple to support disabling or enabling plugins or custom plugin configurations. [PR #587](https://github.com/kubernetes-sigs/descheduler/pull/587) introduces descheduler profiles with a v1alpha2 API version. We will use this proposal as the Koordinator Descheduler's configuration API.
+
+- The descheduler profile API supports users specifying which extension points are enabled/disabled, alongside plugin configuration, including the ability to configure multiple descheduling profiles.
+- The descheduling framework configuration can be converted into an internal representation.
+- To reduce the need to specify a value for every possible configuration, defaulting serves as recommended/opinionated settings for the plugins.
+
+#### Abstract PodEvictor interface
+
+Currently, the descheduler has split `Pod Evictor` and `Evictor Filter`. Users can inject an `Evictor Filter` on demand; a plug-in calls the `Evictor Filter` when selecting abnormal Pods to pick the Pods that meet the requirements, and then calls `Pod Evictor` to initiate the eviction. At present, `Pod Evictor` has not been abstracted as an interface. We adopt the solution in [PoC #781](https://github.com/kubernetes-sigs/descheduler/pull/781) to abstract an `Evictor` interface, and refer to [PR #885](https://github.com/kubernetes-sigs/descheduler/pull/885) to add an `EvictOptions` parameter. We can implement a custom Evictor based on [PodMigrationJob](https://github.com/koordinator-sh/koordinator/blob/main/docs/proposals/scheduling/20220701-pod-migration-job.md).
+
+The `Evictor` interface is defined as follows:
+
+```go
+type EvictOptions struct {
+	Strategy string
+	Reason   string
+}
+
+type Evictor interface {
+	Evict(ctx context.Context, pod *v1.Pod, options EvictOptions) bool
+}
+```
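+
+As an illustration only, a PodMigrationJob-based Evictor could satisfy this interface roughly as sketched below. The
+`MigrationEvictor` type and the annotation keys are hypothetical, and the PodMigrationJob fields follow the Pod Migration Job
+proposal referenced above; this is a sketch, not a definitive implementation.
+
+```go
+import (
+	"context"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2"
+
+	"github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
+	versioned "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned"
+)
+
+// MigrationEvictor "evicts" a Pod by creating a PodMigrationJob instead of
+// deleting the Pod directly, so the migration controller can arbitrate it.
+type MigrationEvictor struct {
+	client versioned.Interface
+}
+
+func (e *MigrationEvictor) Evict(ctx context.Context, pod *corev1.Pod, options EvictOptions) bool {
+	job := &v1alpha1.PodMigrationJob{
+		ObjectMeta: metav1.ObjectMeta{
+			GenerateName: "descheduler-",
+			Annotations: map[string]string{
+				// Hypothetical annotation keys used only to carry the eviction context.
+				"descheduler.koordinator.sh/strategy": options.Strategy,
+				"descheduler.koordinator.sh/reason":   options.Reason,
+			},
+		},
+		Spec: v1alpha1.PodMigrationJobSpec{
+			PodRef: &corev1.ObjectReference{
+				Namespace: pod.Namespace,
+				Name:      pod.Name,
+				UID:       pod.UID,
+			},
+		},
+	}
+	if _, err := e.client.SchedulingV1alpha1().PodMigrationJobs().Create(ctx, job, metav1.CreateOptions{}); err != nil {
+		klog.Errorf("failed to create PodMigrationJob for %s/%s: %v", pod.Namespace, pod.Name, err)
+		return false
+	}
+	return true
+}
+```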
+
+#### Plug-in descheduler strategy
+
+The current descheduler has some strategies. In [PoC #781](https://github.com/kubernetes-sigs/descheduler/pull/781), they are converted into `Plugin`s and executed periodically. In this `periodic execution mode`, it is appropriate to abstract the policies for the Pod and Node dimensions as `DeschedulePlugin` or `BalancePlugin`. The load hotspot descheduling capability that we will implement later can also implement the `BalancePlugin` interface.
+
+We also need to support the `event-triggered mode`, which means that descheduling is performed in the form of a Controller.
+In some scenarios, CRD-oriented descheduling needs to be implemented. For example, different descheduling configurations are provided according to the workload, and when some abnormality is detected in the workload, descheduling will be triggered. We can think of a Controller as a special form of Plugin. When the descheduler is initialized, an instance is constructed through the plugin factory function like a normal Plugin, and then a similar Run method is called to start execution.
+
+## Alternatives
+
+## Implementation History
+
+- 2022-07-01: Initial proposal
+- 2022-07-15: Refactor proposal for review
diff --git a/go.mod b/go.mod
index 54ead1b7a..c2d73e42f 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,7 @@ module github.com/koordinator-sh/koordinator
 go 1.17
 
 require (
+	github.com/NVIDIA/go-nvml v0.11.6-0.0.20220715143214-a79f46f2a6f7
 	github.com/docker/docker v20.10.17+incompatible
 	github.com/fsnotify/fsnotify v1.5.4
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da
@@ -13,19 +14,20 @@ require (
 	github.com/prometheus/client_golang v1.12.2
 	github.com/spf13/cobra v1.1.3
 	github.com/spf13/pflag v1.0.5
-	github.com/stretchr/testify v1.7.5
+	github.com/stretchr/testify v1.8.0
 	go.uber.org/atomic v1.9.0
 	golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac
 	google.golang.org/grpc v1.38.0
 	google.golang.org/protobuf v1.28.0
-	gorm.io/driver/sqlite v1.3.4
-	gorm.io/gorm v1.23.6
+	gorm.io/driver/sqlite v1.3.6
+	gorm.io/gorm v1.23.8
 	k8s.io/api v0.22.6
 	k8s.io/apimachinery v0.22.6
 	k8s.io/apiserver v0.22.6
 	k8s.io/client-go v0.22.6
 	k8s.io/code-generator v0.22.6
 	k8s.io/component-base v0.22.6
+	k8s.io/component-helpers v0.22.6
 	k8s.io/cri-api v0.22.6
 	k8s.io/klog/v2 v2.10.0
 	k8s.io/kube-scheduler v0.22.6
@@ -132,7 +134,6 @@ require (
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 	k8s.io/apiextensions-apiserver v0.22.2 // indirect
 	k8s.io/cloud-provider v0.22.6 // indirect
-	k8s.io/component-helpers v0.22.6 // indirect
 	k8s.io/csi-translation-lib v0.22.6 // indirect
 	k8s.io/gengo v0.0.0-20201214224949-b6c5ce23f027 // indirect
 	k8s.io/kube-openapi v0.0.0-20211109043538-20434351676c // indirect
diff --git a/go.sum b/go.sum
index f05086a6a..a5932f2c0 100644
--- a/go.sum
+++ b/go.sum
@@ -61,6 +61,8 @@ github.com/Microsoft/go-winio v0.4.15 h1:qkLXKzb1QoVatRyd/YlXZ/Kg0m5K3SPuoD82jjS
 github.com/Microsoft/go-winio v0.4.15/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
 github.com/Microsoft/hcsshim v0.8.10-0.20200715222032-5eafd1556990 h1:1xpVY4dSUSbW3PcSGxZJhI8Z+CJiqbd933kM7HIinTc=
 github.com/Microsoft/hcsshim 
v0.8.10-0.20200715222032-5eafd1556990/go.mod h1:ay/0dTb7NsG8QMDfsRfLHgZo/6xAJShLe1+ePPflihk= +github.com/NVIDIA/go-nvml v0.11.6-0.0.20220715143214-a79f46f2a6f7 h1:yl06QAxbf3g3VZX5/7CaZfTUZqFztdZZvlq0TfAvQrk= +github.com/NVIDIA/go-nvml v0.11.6-0.0.20220715143214-a79f46f2a6f7/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= @@ -609,8 +611,8 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.5 h1:s5PTfem8p8EbKQOctVV53k6jCJt3UX4IEJzwh+C324Q= -github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= @@ -1102,11 +1104,11 @@ gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gorm.io/driver/sqlite v1.3.4 h1:NnFOPVfzi4CPsJPH4wXr6rMkPb4ElHEqKMvrsx9c9Fk= -gorm.io/driver/sqlite v1.3.4/go.mod h1:B+8GyC9K7VgzJAcrcXMRPdnMcck+8FgJynEehEPM16U= +gorm.io/driver/sqlite v1.3.6 h1:Fi8xNYCUplOqWiPa3/GuCeowRNBRGTf62DEmhMDHeQQ= +gorm.io/driver/sqlite v1.3.6/go.mod h1:Sg1/pvnKtbQ7jLXxfZa+jSHvoX8hoZA8cn4xllOMTgE= gorm.io/gorm v1.23.4/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= -gorm.io/gorm v1.23.6 h1:KFLdNgri4ExFFGTRGGFWON2P1ZN28+9SJRN8voOoYe0= -gorm.io/gorm v1.23.6/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= +gorm.io/gorm v1.23.8 h1:h8sGJ+biDgBA1AD1Ha9gFCx7h8npU7AsLdlkX0n2TpE= +gorm.io/gorm v1.23.8/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= diff --git a/hack/run-test.sh b/hack/run-test.sh index d1beac7d5..3e4646aa1 100755 --- a/hack/run-test.sh +++ b/hack/run-test.sh @@ -38,7 +38,7 @@ function run_test() { go test -timeout 30s "github.com/koordinator-sh/koordinator/$pkg" -v done <<< "$pkgs" else - pkgs=`grep "$pattern" -r ./pkg | cut -d: -f1 | grep -E ".go$" | xargs -i dirname {}` + pkgs=`grep "$pattern" -r ./pkg | cut -d: -f1 | grep -E ".go$" | xargs -i dirname {} | sort -u` while read -r line do pkg=${line#".//"} diff --git 
a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/device.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/device.go new file mode 100644 index 000000000..3ba5b0584 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/device.go @@ -0,0 +1,184 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "context" + "time" + + v1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + scheme "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// DevicesGetter has a method to return a DeviceInterface. +// A group's client should implement this interface. +type DevicesGetter interface { + Devices() DeviceInterface +} + +// DeviceInterface has methods to work with Device resources. +type DeviceInterface interface { + Create(ctx context.Context, device *v1alpha1.Device, opts v1.CreateOptions) (*v1alpha1.Device, error) + Update(ctx context.Context, device *v1alpha1.Device, opts v1.UpdateOptions) (*v1alpha1.Device, error) + UpdateStatus(ctx context.Context, device *v1alpha1.Device, opts v1.UpdateOptions) (*v1alpha1.Device, error) + Delete(ctx context.Context, name string, opts v1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error + Get(ctx context.Context, name string, opts v1.GetOptions) (*v1alpha1.Device, error) + List(ctx context.Context, opts v1.ListOptions) (*v1alpha1.DeviceList, error) + Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.Device, err error) + DeviceExpansion +} + +// devices implements DeviceInterface +type devices struct { + client rest.Interface +} + +// newDevices returns a Devices +func newDevices(c *SchedulingV1alpha1Client) *devices { + return &devices{ + client: c.RESTClient(), + } +} + +// Get takes name of the device, and returns the corresponding device object, and an error if there is any. +func (c *devices) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.Device, err error) { + result = &v1alpha1.Device{} + err = c.client.Get(). + Resource("devices"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(ctx). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of Devices that match those selectors. +func (c *devices) List(ctx context.Context, opts v1.ListOptions) (result *v1alpha1.DeviceList, err error) { + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + result = &v1alpha1.DeviceList{} + err = c.client.Get(). + Resource("devices"). 
+ VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Do(ctx). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested devices. +func (c *devices) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + opts.Watch = true + return c.client.Get(). + Resource("devices"). + VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Watch(ctx) +} + +// Create takes the representation of a device and creates it. Returns the server's representation of the device, and an error, if there is any. +func (c *devices) Create(ctx context.Context, device *v1alpha1.Device, opts v1.CreateOptions) (result *v1alpha1.Device, err error) { + result = &v1alpha1.Device{} + err = c.client.Post(). + Resource("devices"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(device). + Do(ctx). + Into(result) + return +} + +// Update takes the representation of a device and updates it. Returns the server's representation of the device, and an error, if there is any. +func (c *devices) Update(ctx context.Context, device *v1alpha1.Device, opts v1.UpdateOptions) (result *v1alpha1.Device, err error) { + result = &v1alpha1.Device{} + err = c.client.Put(). + Resource("devices"). + Name(device.Name). + VersionedParams(&opts, scheme.ParameterCodec). + Body(device). + Do(ctx). + Into(result) + return +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *devices) UpdateStatus(ctx context.Context, device *v1alpha1.Device, opts v1.UpdateOptions) (result *v1alpha1.Device, err error) { + result = &v1alpha1.Device{} + err = c.client.Put(). + Resource("devices"). + Name(device.Name). + SubResource("status"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(device). + Do(ctx). + Into(result) + return +} + +// Delete takes name of the device and deletes it. Returns an error if one occurs. +func (c *devices) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { + return c.client.Delete(). + Resource("devices"). + Name(name). + Body(&opts). + Do(ctx). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *devices) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { + var timeout time.Duration + if listOpts.TimeoutSeconds != nil { + timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second + } + return c.client.Delete(). + Resource("devices"). + VersionedParams(&listOpts, scheme.ParameterCodec). + Timeout(timeout). + Body(&opts). + Do(ctx). + Error() +} + +// Patch applies the patch and returns the patched device. +func (c *devices) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.Device, err error) { + result = &v1alpha1.Device{} + err = c.client.Patch(pt). + Resource("devices"). + Name(name). + SubResource(subresources...). + VersionedParams(&opts, scheme.ParameterCodec). + Body(data). + Do(ctx). 
+ Into(result) + return +} diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_device.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_device.go new file mode 100644 index 000000000..95ec607dc --- /dev/null +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_device.go @@ -0,0 +1,133 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + "context" + + v1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeDevices implements DeviceInterface +type FakeDevices struct { + Fake *FakeSchedulingV1alpha1 +} + +var devicesResource = schema.GroupVersionResource{Group: "scheduling.koordinator.sh", Version: "v1alpha1", Resource: "devices"} + +var devicesKind = schema.GroupVersionKind{Group: "scheduling.koordinator.sh", Version: "v1alpha1", Kind: "Device"} + +// Get takes name of the device, and returns the corresponding device object, and an error if there is any. +func (c *FakeDevices) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.Device, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootGetAction(devicesResource, name), &v1alpha1.Device{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.Device), err +} + +// List takes label and field selectors, and returns the list of Devices that match those selectors. +func (c *FakeDevices) List(ctx context.Context, opts v1.ListOptions) (result *v1alpha1.DeviceList, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootListAction(devicesResource, devicesKind, opts), &v1alpha1.DeviceList{}) + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha1.DeviceList{ListMeta: obj.(*v1alpha1.DeviceList).ListMeta} + for _, item := range obj.(*v1alpha1.DeviceList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested devices. +func (c *FakeDevices) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewRootWatchAction(devicesResource, opts)) +} + +// Create takes the representation of a device and creates it. Returns the server's representation of the device, and an error, if there is any. +func (c *FakeDevices) Create(ctx context.Context, device *v1alpha1.Device, opts v1.CreateOptions) (result *v1alpha1.Device, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewRootCreateAction(devicesResource, device), &v1alpha1.Device{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.Device), err +} + +// Update takes the representation of a device and updates it. Returns the server's representation of the device, and an error, if there is any. +func (c *FakeDevices) Update(ctx context.Context, device *v1alpha1.Device, opts v1.UpdateOptions) (result *v1alpha1.Device, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootUpdateAction(devicesResource, device), &v1alpha1.Device{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.Device), err +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *FakeDevices) UpdateStatus(ctx context.Context, device *v1alpha1.Device, opts v1.UpdateOptions) (*v1alpha1.Device, error) { + obj, err := c.Fake. + Invokes(testing.NewRootUpdateSubresourceAction(devicesResource, "status", device), &v1alpha1.Device{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.Device), err +} + +// Delete takes name of the device and deletes it. Returns an error if one occurs. +func (c *FakeDevices) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewRootDeleteAction(devicesResource, name), &v1alpha1.Device{}) + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeDevices) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { + action := testing.NewRootDeleteCollectionAction(devicesResource, listOpts) + + _, err := c.Fake.Invokes(action, &v1alpha1.DeviceList{}) + return err +} + +// Patch applies the patch and returns the patched device. +func (c *FakeDevices) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.Device, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootPatchSubresourceAction(devicesResource, name, pt, data, subresources...), &v1alpha1.Device{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.Device), err +} diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_podmigrationjob.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_podmigrationjob.go new file mode 100644 index 000000000..feea40ae0 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_podmigrationjob.go @@ -0,0 +1,133 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. 
+ +package fake + +import ( + "context" + + v1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakePodMigrationJobs implements PodMigrationJobInterface +type FakePodMigrationJobs struct { + Fake *FakeSchedulingV1alpha1 +} + +var podmigrationjobsResource = schema.GroupVersionResource{Group: "scheduling.koordinator.sh", Version: "v1alpha1", Resource: "podmigrationjobs"} + +var podmigrationjobsKind = schema.GroupVersionKind{Group: "scheduling.koordinator.sh", Version: "v1alpha1", Kind: "PodMigrationJob"} + +// Get takes name of the podMigrationJob, and returns the corresponding podMigrationJob object, and an error if there is any. +func (c *FakePodMigrationJobs) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.PodMigrationJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootGetAction(podmigrationjobsResource, name), &v1alpha1.PodMigrationJob{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.PodMigrationJob), err +} + +// List takes label and field selectors, and returns the list of PodMigrationJobs that match those selectors. +func (c *FakePodMigrationJobs) List(ctx context.Context, opts v1.ListOptions) (result *v1alpha1.PodMigrationJobList, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootListAction(podmigrationjobsResource, podmigrationjobsKind, opts), &v1alpha1.PodMigrationJobList{}) + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha1.PodMigrationJobList{ListMeta: obj.(*v1alpha1.PodMigrationJobList).ListMeta} + for _, item := range obj.(*v1alpha1.PodMigrationJobList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested podMigrationJobs. +func (c *FakePodMigrationJobs) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewRootWatchAction(podmigrationjobsResource, opts)) +} + +// Create takes the representation of a podMigrationJob and creates it. Returns the server's representation of the podMigrationJob, and an error, if there is any. +func (c *FakePodMigrationJobs) Create(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.CreateOptions) (result *v1alpha1.PodMigrationJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootCreateAction(podmigrationjobsResource, podMigrationJob), &v1alpha1.PodMigrationJob{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.PodMigrationJob), err +} + +// Update takes the representation of a podMigrationJob and updates it. Returns the server's representation of the podMigrationJob, and an error, if there is any. +func (c *FakePodMigrationJobs) Update(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.UpdateOptions) (result *v1alpha1.PodMigrationJob, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewRootUpdateAction(podmigrationjobsResource, podMigrationJob), &v1alpha1.PodMigrationJob{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.PodMigrationJob), err +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *FakePodMigrationJobs) UpdateStatus(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.UpdateOptions) (*v1alpha1.PodMigrationJob, error) { + obj, err := c.Fake. + Invokes(testing.NewRootUpdateSubresourceAction(podmigrationjobsResource, "status", podMigrationJob), &v1alpha1.PodMigrationJob{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.PodMigrationJob), err +} + +// Delete takes name of the podMigrationJob and deletes it. Returns an error if one occurs. +func (c *FakePodMigrationJobs) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewRootDeleteAction(podmigrationjobsResource, name), &v1alpha1.PodMigrationJob{}) + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakePodMigrationJobs) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { + action := testing.NewRootDeleteCollectionAction(podmigrationjobsResource, listOpts) + + _, err := c.Fake.Invokes(action, &v1alpha1.PodMigrationJobList{}) + return err +} + +// Patch applies the patch and returns the patched podMigrationJob. +func (c *FakePodMigrationJobs) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.PodMigrationJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootPatchSubresourceAction(podmigrationjobsResource, name, pt, data, subresources...), &v1alpha1.PodMigrationJob{}) + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.PodMigrationJob), err +} diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_reservation.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_reservation.go index 682a499a3..e1faa9120 100644 --- a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_reservation.go +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_reservation.go @@ -33,6 +33,7 @@ import ( // FakeReservations implements ReservationInterface type FakeReservations struct { Fake *FakeSchedulingV1alpha1 + ns string } var reservationsResource = schema.GroupVersionResource{Group: "scheduling.koordinator.sh", Version: "v1alpha1", Resource: "reservations"} @@ -42,7 +43,8 @@ var reservationsKind = schema.GroupVersionKind{Group: "scheduling.koordinator.sh // Get takes name of the reservation, and returns the corresponding reservation object, and an error if there is any. func (c *FakeReservations) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.Reservation, err error) { obj, err := c.Fake. - Invokes(testing.NewRootGetAction(reservationsResource, name), &v1alpha1.Reservation{}) + Invokes(testing.NewGetAction(reservationsResource, c.ns, name), &v1alpha1.Reservation{}) + if obj == nil { return nil, err } @@ -52,7 +54,8 @@ func (c *FakeReservations) Get(ctx context.Context, name string, options v1.GetO // List takes label and field selectors, and returns the list of Reservations that match those selectors. 
func (c *FakeReservations) List(ctx context.Context, opts v1.ListOptions) (result *v1alpha1.ReservationList, err error) { obj, err := c.Fake. - Invokes(testing.NewRootListAction(reservationsResource, reservationsKind, opts), &v1alpha1.ReservationList{}) + Invokes(testing.NewListAction(reservationsResource, reservationsKind, c.ns, opts), &v1alpha1.ReservationList{}) + if obj == nil { return nil, err } @@ -73,13 +76,15 @@ func (c *FakeReservations) List(ctx context.Context, opts v1.ListOptions) (resul // Watch returns a watch.Interface that watches the requested reservations. func (c *FakeReservations) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { return c.Fake. - InvokesWatch(testing.NewRootWatchAction(reservationsResource, opts)) + InvokesWatch(testing.NewWatchAction(reservationsResource, c.ns, opts)) + } // Create takes the representation of a reservation and creates it. Returns the server's representation of the reservation, and an error, if there is any. func (c *FakeReservations) Create(ctx context.Context, reservation *v1alpha1.Reservation, opts v1.CreateOptions) (result *v1alpha1.Reservation, err error) { obj, err := c.Fake. - Invokes(testing.NewRootCreateAction(reservationsResource, reservation), &v1alpha1.Reservation{}) + Invokes(testing.NewCreateAction(reservationsResource, c.ns, reservation), &v1alpha1.Reservation{}) + if obj == nil { return nil, err } @@ -89,7 +94,8 @@ func (c *FakeReservations) Create(ctx context.Context, reservation *v1alpha1.Res // Update takes the representation of a reservation and updates it. Returns the server's representation of the reservation, and an error, if there is any. func (c *FakeReservations) Update(ctx context.Context, reservation *v1alpha1.Reservation, opts v1.UpdateOptions) (result *v1alpha1.Reservation, err error) { obj, err := c.Fake. - Invokes(testing.NewRootUpdateAction(reservationsResource, reservation), &v1alpha1.Reservation{}) + Invokes(testing.NewUpdateAction(reservationsResource, c.ns, reservation), &v1alpha1.Reservation{}) + if obj == nil { return nil, err } @@ -100,7 +106,8 @@ func (c *FakeReservations) Update(ctx context.Context, reservation *v1alpha1.Res // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). func (c *FakeReservations) UpdateStatus(ctx context.Context, reservation *v1alpha1.Reservation, opts v1.UpdateOptions) (*v1alpha1.Reservation, error) { obj, err := c.Fake. - Invokes(testing.NewRootUpdateSubresourceAction(reservationsResource, "status", reservation), &v1alpha1.Reservation{}) + Invokes(testing.NewUpdateSubresourceAction(reservationsResource, "status", c.ns, reservation), &v1alpha1.Reservation{}) + if obj == nil { return nil, err } @@ -110,13 +117,14 @@ func (c *FakeReservations) UpdateStatus(ctx context.Context, reservation *v1alph // Delete takes name of the reservation and deletes it. Returns an error if one occurs. func (c *FakeReservations) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { _, err := c.Fake. - Invokes(testing.NewRootDeleteAction(reservationsResource, name), &v1alpha1.Reservation{}) + Invokes(testing.NewDeleteAction(reservationsResource, c.ns, name), &v1alpha1.Reservation{}) + return err } // DeleteCollection deletes a collection of objects. 
func (c *FakeReservations) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { - action := testing.NewRootDeleteCollectionAction(reservationsResource, listOpts) + action := testing.NewDeleteCollectionAction(reservationsResource, c.ns, listOpts) _, err := c.Fake.Invokes(action, &v1alpha1.ReservationList{}) return err @@ -125,7 +133,8 @@ func (c *FakeReservations) DeleteCollection(ctx context.Context, opts v1.DeleteO // Patch applies the patch and returns the patched reservation. func (c *FakeReservations) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.Reservation, err error) { obj, err := c.Fake. - Invokes(testing.NewRootPatchSubresourceAction(reservationsResource, name, pt, data, subresources...), &v1alpha1.Reservation{}) + Invokes(testing.NewPatchSubresourceAction(reservationsResource, c.ns, name, pt, data, subresources...), &v1alpha1.Reservation{}) + if obj == nil { return nil, err } diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_scheduling_client.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_scheduling_client.go index 7157b777f..533d9a66f 100644 --- a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_scheduling_client.go +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/fake/fake_scheduling_client.go @@ -28,8 +28,16 @@ type FakeSchedulingV1alpha1 struct { *testing.Fake } -func (c *FakeSchedulingV1alpha1) Reservations() v1alpha1.ReservationInterface { - return &FakeReservations{c} +func (c *FakeSchedulingV1alpha1) Devices() v1alpha1.DeviceInterface { + return &FakeDevices{c} +} + +func (c *FakeSchedulingV1alpha1) PodMigrationJobs() v1alpha1.PodMigrationJobInterface { + return &FakePodMigrationJobs{c} +} + +func (c *FakeSchedulingV1alpha1) Reservations(namespace string) v1alpha1.ReservationInterface { + return &FakeReservations{c, namespace} } // RESTClient returns a RESTClient that is used to communicate diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/generated_expansion.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/generated_expansion.go index 8a9cb0f3b..f8594f8b5 100644 --- a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/generated_expansion.go +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/generated_expansion.go @@ -18,4 +18,8 @@ limitations under the License. package v1alpha1 +type DeviceExpansion interface{} + +type PodMigrationJobExpansion interface{} + type ReservationExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/podmigrationjob.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/podmigrationjob.go new file mode 100644 index 000000000..c3389ca24 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/podmigrationjob.go @@ -0,0 +1,184 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "context" + "time" + + v1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + scheme "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// PodMigrationJobsGetter has a method to return a PodMigrationJobInterface. +// A group's client should implement this interface. +type PodMigrationJobsGetter interface { + PodMigrationJobs() PodMigrationJobInterface +} + +// PodMigrationJobInterface has methods to work with PodMigrationJob resources. +type PodMigrationJobInterface interface { + Create(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.CreateOptions) (*v1alpha1.PodMigrationJob, error) + Update(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.UpdateOptions) (*v1alpha1.PodMigrationJob, error) + UpdateStatus(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.UpdateOptions) (*v1alpha1.PodMigrationJob, error) + Delete(ctx context.Context, name string, opts v1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error + Get(ctx context.Context, name string, opts v1.GetOptions) (*v1alpha1.PodMigrationJob, error) + List(ctx context.Context, opts v1.ListOptions) (*v1alpha1.PodMigrationJobList, error) + Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.PodMigrationJob, err error) + PodMigrationJobExpansion +} + +// podMigrationJobs implements PodMigrationJobInterface +type podMigrationJobs struct { + client rest.Interface +} + +// newPodMigrationJobs returns a PodMigrationJobs +func newPodMigrationJobs(c *SchedulingV1alpha1Client) *podMigrationJobs { + return &podMigrationJobs{ + client: c.RESTClient(), + } +} + +// Get takes name of the podMigrationJob, and returns the corresponding podMigrationJob object, and an error if there is any. +func (c *podMigrationJobs) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.PodMigrationJob, err error) { + result = &v1alpha1.PodMigrationJob{} + err = c.client.Get(). + Resource("podmigrationjobs"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(ctx). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of PodMigrationJobs that match those selectors. +func (c *podMigrationJobs) List(ctx context.Context, opts v1.ListOptions) (result *v1alpha1.PodMigrationJobList, err error) { + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + result = &v1alpha1.PodMigrationJobList{} + err = c.client.Get(). + Resource("podmigrationjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Do(ctx). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested podMigrationJobs. +func (c *podMigrationJobs) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + opts.Watch = true + return c.client.Get(). + Resource("podmigrationjobs"). 
+ VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Watch(ctx) +} + +// Create takes the representation of a podMigrationJob and creates it. Returns the server's representation of the podMigrationJob, and an error, if there is any. +func (c *podMigrationJobs) Create(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.CreateOptions) (result *v1alpha1.PodMigrationJob, err error) { + result = &v1alpha1.PodMigrationJob{} + err = c.client.Post(). + Resource("podmigrationjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(podMigrationJob). + Do(ctx). + Into(result) + return +} + +// Update takes the representation of a podMigrationJob and updates it. Returns the server's representation of the podMigrationJob, and an error, if there is any. +func (c *podMigrationJobs) Update(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.UpdateOptions) (result *v1alpha1.PodMigrationJob, err error) { + result = &v1alpha1.PodMigrationJob{} + err = c.client.Put(). + Resource("podmigrationjobs"). + Name(podMigrationJob.Name). + VersionedParams(&opts, scheme.ParameterCodec). + Body(podMigrationJob). + Do(ctx). + Into(result) + return +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *podMigrationJobs) UpdateStatus(ctx context.Context, podMigrationJob *v1alpha1.PodMigrationJob, opts v1.UpdateOptions) (result *v1alpha1.PodMigrationJob, err error) { + result = &v1alpha1.PodMigrationJob{} + err = c.client.Put(). + Resource("podmigrationjobs"). + Name(podMigrationJob.Name). + SubResource("status"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(podMigrationJob). + Do(ctx). + Into(result) + return +} + +// Delete takes name of the podMigrationJob and deletes it. Returns an error if one occurs. +func (c *podMigrationJobs) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { + return c.client.Delete(). + Resource("podmigrationjobs"). + Name(name). + Body(&opts). + Do(ctx). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *podMigrationJobs) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { + var timeout time.Duration + if listOpts.TimeoutSeconds != nil { + timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second + } + return c.client.Delete(). + Resource("podmigrationjobs"). + VersionedParams(&listOpts, scheme.ParameterCodec). + Timeout(timeout). + Body(&opts). + Do(ctx). + Error() +} + +// Patch applies the patch and returns the patched podMigrationJob. +func (c *podMigrationJobs) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.PodMigrationJob, err error) { + result = &v1alpha1.PodMigrationJob{} + err = c.client.Patch(pt). + Resource("podmigrationjobs"). + Name(name). + SubResource(subresources...). + VersionedParams(&opts, scheme.ParameterCodec). + Body(data). + Do(ctx). 
+ Into(result) + return +} diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/reservation.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/reservation.go index 32e3f37e8..0beb4061d 100644 --- a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/reservation.go +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/reservation.go @@ -33,7 +33,7 @@ import ( // ReservationsGetter has a method to return a ReservationInterface. // A group's client should implement this interface. type ReservationsGetter interface { - Reservations() ReservationInterface + Reservations(namespace string) ReservationInterface } // ReservationInterface has methods to work with Reservation resources. @@ -53,12 +53,14 @@ type ReservationInterface interface { // reservations implements ReservationInterface type reservations struct { client rest.Interface + ns string } // newReservations returns a Reservations -func newReservations(c *SchedulingV1alpha1Client) *reservations { +func newReservations(c *SchedulingV1alpha1Client, namespace string) *reservations { return &reservations{ client: c.RESTClient(), + ns: namespace, } } @@ -66,6 +68,7 @@ func newReservations(c *SchedulingV1alpha1Client) *reservations { func (c *reservations) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.Reservation, err error) { result = &v1alpha1.Reservation{} err = c.client.Get(). + Namespace(c.ns). Resource("reservations"). Name(name). VersionedParams(&options, scheme.ParameterCodec). @@ -82,6 +85,7 @@ func (c *reservations) List(ctx context.Context, opts v1.ListOptions) (result *v } result = &v1alpha1.ReservationList{} err = c.client.Get(). + Namespace(c.ns). Resource("reservations"). VersionedParams(&opts, scheme.ParameterCodec). Timeout(timeout). @@ -98,6 +102,7 @@ func (c *reservations) Watch(ctx context.Context, opts v1.ListOptions) (watch.In } opts.Watch = true return c.client.Get(). + Namespace(c.ns). Resource("reservations"). VersionedParams(&opts, scheme.ParameterCodec). Timeout(timeout). @@ -108,6 +113,7 @@ func (c *reservations) Watch(ctx context.Context, opts v1.ListOptions) (watch.In func (c *reservations) Create(ctx context.Context, reservation *v1alpha1.Reservation, opts v1.CreateOptions) (result *v1alpha1.Reservation, err error) { result = &v1alpha1.Reservation{} err = c.client.Post(). + Namespace(c.ns). Resource("reservations"). VersionedParams(&opts, scheme.ParameterCodec). Body(reservation). @@ -120,6 +126,7 @@ func (c *reservations) Create(ctx context.Context, reservation *v1alpha1.Reserva func (c *reservations) Update(ctx context.Context, reservation *v1alpha1.Reservation, opts v1.UpdateOptions) (result *v1alpha1.Reservation, err error) { result = &v1alpha1.Reservation{} err = c.client.Put(). + Namespace(c.ns). Resource("reservations"). Name(reservation.Name). VersionedParams(&opts, scheme.ParameterCodec). @@ -134,6 +141,7 @@ func (c *reservations) Update(ctx context.Context, reservation *v1alpha1.Reserva func (c *reservations) UpdateStatus(ctx context.Context, reservation *v1alpha1.Reservation, opts v1.UpdateOptions) (result *v1alpha1.Reservation, err error) { result = &v1alpha1.Reservation{} err = c.client.Put(). + Namespace(c.ns). Resource("reservations"). Name(reservation.Name). SubResource("status"). @@ -147,6 +155,7 @@ func (c *reservations) UpdateStatus(ctx context.Context, reservation *v1alpha1.R // Delete takes name of the reservation and deletes it. Returns an error if one occurs. 
func (c *reservations) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { return c.client.Delete(). + Namespace(c.ns). Resource("reservations"). Name(name). Body(&opts). @@ -161,6 +170,7 @@ func (c *reservations) DeleteCollection(ctx context.Context, opts v1.DeleteOptio timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second } return c.client.Delete(). + Namespace(c.ns). Resource("reservations"). VersionedParams(&listOpts, scheme.ParameterCodec). Timeout(timeout). @@ -173,6 +183,7 @@ func (c *reservations) DeleteCollection(ctx context.Context, opts v1.DeleteOptio func (c *reservations) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.Reservation, err error) { result = &v1alpha1.Reservation{} err = c.client.Patch(pt). + Namespace(c.ns). Resource("reservations"). Name(name). SubResource(subresources...). diff --git a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/scheduling_client.go b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/scheduling_client.go index 45431942f..7f7261c6c 100644 --- a/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/scheduling_client.go +++ b/pkg/client/clientset/versioned/typed/scheduling/v1alpha1/scheduling_client.go @@ -26,6 +26,8 @@ import ( type SchedulingV1alpha1Interface interface { RESTClient() rest.Interface + DevicesGetter + PodMigrationJobsGetter ReservationsGetter } @@ -34,8 +36,16 @@ type SchedulingV1alpha1Client struct { restClient rest.Interface } -func (c *SchedulingV1alpha1Client) Reservations() ReservationInterface { - return newReservations(c) +func (c *SchedulingV1alpha1Client) Devices() DeviceInterface { + return newDevices(c) +} + +func (c *SchedulingV1alpha1Client) PodMigrationJobs() PodMigrationJobInterface { + return newPodMigrationJobs(c) +} + +func (c *SchedulingV1alpha1Client) Reservations(namespace string) ReservationInterface { + return newReservations(c, namespace) } // NewForConfig creates a new SchedulingV1alpha1Client for the given config. 
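With the scheduling client above, Reservations becomes namespaced (the getter takes a namespace argument) while the new Devices and PodMigrationJobs getters stay cluster-scoped. A minimal, illustrative usage sketch of the generated clientset follows; the kubeconfig loading and the "default" namespace are assumptions for the example, not part of this change.

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"

	versioned "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned"
)

func main() {
	// Assumption: load a kubeconfig from the default location for the sketch.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs, err := versioned.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// Namespaced after this change: list Reservations in a single namespace.
	reservations, err := cs.SchedulingV1alpha1().Reservations("default").List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	fmt.Println("reservations:", len(reservations.Items))

	// Cluster-scoped: the new PodMigrationJobs getter takes no namespace.
	jobs, err := cs.SchedulingV1alpha1().PodMigrationJobs().List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	fmt.Println("podmigrationjobs:", len(jobs.Items))
}

diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go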
diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go index 0304aae86..697b060bc 100644 --- a/pkg/client/informers/externalversions/generic.go +++ b/pkg/client/informers/externalversions/generic.go @@ -59,6 +59,10 @@ func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource return &genericInformer{resource: resource.GroupResource(), informer: f.Config().V1alpha1().ClusterColocationProfiles().Informer()}, nil // Group=scheduling, Version=v1alpha1 + case schedulingv1alpha1.SchemeGroupVersion.WithResource("devices"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Scheduling().V1alpha1().Devices().Informer()}, nil + case schedulingv1alpha1.SchemeGroupVersion.WithResource("podmigrationjobs"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Scheduling().V1alpha1().PodMigrationJobs().Informer()}, nil case schedulingv1alpha1.SchemeGroupVersion.WithResource("reservations"): return &genericInformer{resource: resource.GroupResource(), informer: f.Scheduling().V1alpha1().Reservations().Informer()}, nil diff --git a/pkg/client/informers/externalversions/scheduling/v1alpha1/device.go b/pkg/client/informers/externalversions/scheduling/v1alpha1/device.go new file mode 100644 index 000000000..257d333f2 --- /dev/null +++ b/pkg/client/informers/externalversions/scheduling/v1alpha1/device.go @@ -0,0 +1,89 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "context" + time "time" + + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + versioned "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned" + internalinterfaces "github.com/koordinator-sh/koordinator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha1 "github.com/koordinator-sh/koordinator/pkg/client/listers/scheduling/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" +) + +// DeviceInformer provides access to a shared informer and lister for +// Devices. +type DeviceInformer interface { + Informer() cache.SharedIndexInformer + Lister() v1alpha1.DeviceLister +} + +type deviceInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// NewDeviceInformer constructs a new informer for Device type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewDeviceInformer(client versioned.Interface, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredDeviceInformer(client, resyncPeriod, indexers, nil) +} + +// NewFilteredDeviceInformer constructs a new informer for Device type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewFilteredDeviceInformer(client versioned.Interface, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.SchedulingV1alpha1().Devices().List(context.TODO(), options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.SchedulingV1alpha1().Devices().Watch(context.TODO(), options) + }, + }, + &schedulingv1alpha1.Device{}, + resyncPeriod, + indexers, + ) +} + +func (f *deviceInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredDeviceInformer(client, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *deviceInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&schedulingv1alpha1.Device{}, f.defaultInformer) +} + +func (f *deviceInformer) Lister() v1alpha1.DeviceLister { + return v1alpha1.NewDeviceLister(f.Informer().GetIndexer()) +} diff --git a/pkg/client/informers/externalversions/scheduling/v1alpha1/interface.go b/pkg/client/informers/externalversions/scheduling/v1alpha1/interface.go index e5ac6bc85..5d2b4d607 100644 --- a/pkg/client/informers/externalversions/scheduling/v1alpha1/interface.go +++ b/pkg/client/informers/externalversions/scheduling/v1alpha1/interface.go @@ -24,6 +24,10 @@ import ( // Interface provides access to all the informers in this group version. type Interface interface { + // Devices returns a DeviceInformer. + Devices() DeviceInformer + // PodMigrationJobs returns a PodMigrationJobInformer. + PodMigrationJobs() PodMigrationJobInformer // Reservations returns a ReservationInformer. Reservations() ReservationInformer } @@ -39,7 +43,17 @@ func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakList return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } +// Devices returns a DeviceInformer. +func (v *version) Devices() DeviceInformer { + return &deviceInformer{factory: v.factory, tweakListOptions: v.tweakListOptions} +} + +// PodMigrationJobs returns a PodMigrationJobInformer. +func (v *version) PodMigrationJobs() PodMigrationJobInformer { + return &podMigrationJobInformer{factory: v.factory, tweakListOptions: v.tweakListOptions} +} + // Reservations returns a ReservationInformer. 
func (v *version) Reservations() ReservationInformer { - return &reservationInformer{factory: v.factory, tweakListOptions: v.tweakListOptions} + return &reservationInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} } diff --git a/pkg/client/informers/externalversions/scheduling/v1alpha1/podmigrationjob.go b/pkg/client/informers/externalversions/scheduling/v1alpha1/podmigrationjob.go new file mode 100644 index 000000000..64dadc106 --- /dev/null +++ b/pkg/client/informers/externalversions/scheduling/v1alpha1/podmigrationjob.go @@ -0,0 +1,89 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "context" + time "time" + + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + versioned "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned" + internalinterfaces "github.com/koordinator-sh/koordinator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha1 "github.com/koordinator-sh/koordinator/pkg/client/listers/scheduling/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" +) + +// PodMigrationJobInformer provides access to a shared informer and lister for +// PodMigrationJobs. +type PodMigrationJobInformer interface { + Informer() cache.SharedIndexInformer + Lister() v1alpha1.PodMigrationJobLister +} + +type podMigrationJobInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// NewPodMigrationJobInformer constructs a new informer for PodMigrationJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewPodMigrationJobInformer(client versioned.Interface, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredPodMigrationJobInformer(client, resyncPeriod, indexers, nil) +} + +// NewFilteredPodMigrationJobInformer constructs a new informer for PodMigrationJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewFilteredPodMigrationJobInformer(client versioned.Interface, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.SchedulingV1alpha1().PodMigrationJobs().List(context.TODO(), options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.SchedulingV1alpha1().PodMigrationJobs().Watch(context.TODO(), options) + }, + }, + &schedulingv1alpha1.PodMigrationJob{}, + resyncPeriod, + indexers, + ) +} + +func (f *podMigrationJobInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredPodMigrationJobInformer(client, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *podMigrationJobInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&schedulingv1alpha1.PodMigrationJob{}, f.defaultInformer) +} + +func (f *podMigrationJobInformer) Lister() v1alpha1.PodMigrationJobLister { + return v1alpha1.NewPodMigrationJobLister(f.Informer().GetIndexer()) +} diff --git a/pkg/client/informers/externalversions/scheduling/v1alpha1/reservation.go b/pkg/client/informers/externalversions/scheduling/v1alpha1/reservation.go index d64529e67..4dadebb03 100644 --- a/pkg/client/informers/externalversions/scheduling/v1alpha1/reservation.go +++ b/pkg/client/informers/externalversions/scheduling/v1alpha1/reservation.go @@ -42,32 +42,33 @@ type ReservationInformer interface { type reservationInformer struct { factory internalinterfaces.SharedInformerFactory tweakListOptions internalinterfaces.TweakListOptionsFunc + namespace string } // NewReservationInformer constructs a new informer for Reservation type. // Always prefer using an informer factory to get a shared informer instead of getting an independent // one. This reduces memory footprint and number of connections to the server. -func NewReservationInformer(client versioned.Interface, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { - return NewFilteredReservationInformer(client, resyncPeriod, indexers, nil) +func NewReservationInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredReservationInformer(client, namespace, resyncPeriod, indexers, nil) } // NewFilteredReservationInformer constructs a new informer for Reservation type. // Always prefer using an informer factory to get a shared informer instead of getting an independent // one. This reduces memory footprint and number of connections to the server. 
-func NewFilteredReservationInformer(client versioned.Interface, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { +func NewFilteredReservationInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { return cache.NewSharedIndexInformer( &cache.ListWatch{ ListFunc: func(options v1.ListOptions) (runtime.Object, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.SchedulingV1alpha1().Reservations().List(context.TODO(), options) + return client.SchedulingV1alpha1().Reservations(namespace).List(context.TODO(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.SchedulingV1alpha1().Reservations().Watch(context.TODO(), options) + return client.SchedulingV1alpha1().Reservations(namespace).Watch(context.TODO(), options) }, }, &schedulingv1alpha1.Reservation{}, @@ -77,7 +78,7 @@ func NewFilteredReservationInformer(client versioned.Interface, resyncPeriod tim } func (f *reservationInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewFilteredReservationInformer(client, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) + return NewFilteredReservationInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) } func (f *reservationInformer) Informer() cache.SharedIndexInformer { diff --git a/pkg/client/listers/scheduling/v1alpha1/device.go b/pkg/client/listers/scheduling/v1alpha1/device.go new file mode 100644 index 000000000..f9b67caf2 --- /dev/null +++ b/pkg/client/listers/scheduling/v1alpha1/device.go @@ -0,0 +1,68 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/tools/cache" +) + +// DeviceLister helps list Devices. +// All objects returned here must be treated as read-only. +type DeviceLister interface { + // List lists all Devices in the indexer. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*v1alpha1.Device, err error) + // Get retrieves the Device from the index for a given name. + // Objects returned here must be treated as read-only. + Get(name string) (*v1alpha1.Device, error) + DeviceListerExpansion +} + +// deviceLister implements the DeviceLister interface. +type deviceLister struct { + indexer cache.Indexer +} + +// NewDeviceLister returns a new DeviceLister. 
+func NewDeviceLister(indexer cache.Indexer) DeviceLister { + return &deviceLister{indexer: indexer} +} + +// List lists all Devices in the indexer. +func (s *deviceLister) List(selector labels.Selector) (ret []*v1alpha1.Device, err error) { + err = cache.ListAll(s.indexer, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha1.Device)) + }) + return ret, err +} + +// Get retrieves the Device from the index for a given name. +func (s *deviceLister) Get(name string) (*v1alpha1.Device, error) { + obj, exists, err := s.indexer.GetByKey(name) + if err != nil { + return nil, err + } + if !exists { + return nil, errors.NewNotFound(v1alpha1.Resource("device"), name) + } + return obj.(*v1alpha1.Device), nil +} diff --git a/pkg/client/listers/scheduling/v1alpha1/expansion_generated.go b/pkg/client/listers/scheduling/v1alpha1/expansion_generated.go index 5ca2bd90e..42d0b243b 100644 --- a/pkg/client/listers/scheduling/v1alpha1/expansion_generated.go +++ b/pkg/client/listers/scheduling/v1alpha1/expansion_generated.go @@ -18,6 +18,18 @@ limitations under the License. package v1alpha1 +// DeviceListerExpansion allows custom methods to be added to +// DeviceLister. +type DeviceListerExpansion interface{} + +// PodMigrationJobListerExpansion allows custom methods to be added to +// PodMigrationJobLister. +type PodMigrationJobListerExpansion interface{} + // ReservationListerExpansion allows custom methods to be added to // ReservationLister. type ReservationListerExpansion interface{} + +// ReservationNamespaceListerExpansion allows custom methods to be added to +// ReservationNamespaceLister. +type ReservationNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/scheduling/v1alpha1/podmigrationjob.go b/pkg/client/listers/scheduling/v1alpha1/podmigrationjob.go new file mode 100644 index 000000000..d979ce7b1 --- /dev/null +++ b/pkg/client/listers/scheduling/v1alpha1/podmigrationjob.go @@ -0,0 +1,68 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/tools/cache" +) + +// PodMigrationJobLister helps list PodMigrationJobs. +// All objects returned here must be treated as read-only. +type PodMigrationJobLister interface { + // List lists all PodMigrationJobs in the indexer. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*v1alpha1.PodMigrationJob, err error) + // Get retrieves the PodMigrationJob from the index for a given name. + // Objects returned here must be treated as read-only. + Get(name string) (*v1alpha1.PodMigrationJob, error) + PodMigrationJobListerExpansion +} + +// podMigrationJobLister implements the PodMigrationJobLister interface. 
+type podMigrationJobLister struct { + indexer cache.Indexer +} + +// NewPodMigrationJobLister returns a new PodMigrationJobLister. +func NewPodMigrationJobLister(indexer cache.Indexer) PodMigrationJobLister { + return &podMigrationJobLister{indexer: indexer} +} + +// List lists all PodMigrationJobs in the indexer. +func (s *podMigrationJobLister) List(selector labels.Selector) (ret []*v1alpha1.PodMigrationJob, err error) { + err = cache.ListAll(s.indexer, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha1.PodMigrationJob)) + }) + return ret, err +} + +// Get retrieves the PodMigrationJob from the index for a given name. +func (s *podMigrationJobLister) Get(name string) (*v1alpha1.PodMigrationJob, error) { + obj, exists, err := s.indexer.GetByKey(name) + if err != nil { + return nil, err + } + if !exists { + return nil, errors.NewNotFound(v1alpha1.Resource("podmigrationjob"), name) + } + return obj.(*v1alpha1.PodMigrationJob), nil +} diff --git a/pkg/client/listers/scheduling/v1alpha1/reservation.go b/pkg/client/listers/scheduling/v1alpha1/reservation.go index b07e042b5..14dee2922 100644 --- a/pkg/client/listers/scheduling/v1alpha1/reservation.go +++ b/pkg/client/listers/scheduling/v1alpha1/reservation.go @@ -31,9 +31,8 @@ type ReservationLister interface { // List lists all Reservations in the indexer. // Objects returned here must be treated as read-only. List(selector labels.Selector) (ret []*v1alpha1.Reservation, err error) - // Get retrieves the Reservation from the index for a given name. - // Objects returned here must be treated as read-only. - Get(name string) (*v1alpha1.Reservation, error) + // Reservations returns an object that can list and get Reservations. + Reservations(namespace string) ReservationNamespaceLister ReservationListerExpansion } @@ -55,9 +54,41 @@ func (s *reservationLister) List(selector labels.Selector) (ret []*v1alpha1.Rese return ret, err } -// Get retrieves the Reservation from the index for a given name. -func (s *reservationLister) Get(name string) (*v1alpha1.Reservation, error) { - obj, exists, err := s.indexer.GetByKey(name) +// Reservations returns an object that can list and get Reservations. +func (s *reservationLister) Reservations(namespace string) ReservationNamespaceLister { + return reservationNamespaceLister{indexer: s.indexer, namespace: namespace} +} + +// ReservationNamespaceLister helps list and get Reservations. +// All objects returned here must be treated as read-only. +type ReservationNamespaceLister interface { + // List lists all Reservations in the indexer for a given namespace. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*v1alpha1.Reservation, err error) + // Get retrieves the Reservation from the indexer for a given namespace and name. + // Objects returned here must be treated as read-only. + Get(name string) (*v1alpha1.Reservation, error) + ReservationNamespaceListerExpansion +} + +// reservationNamespaceLister implements the ReservationNamespaceLister +// interface. +type reservationNamespaceLister struct { + indexer cache.Indexer + namespace string +} + +// List lists all Reservations in the indexer for a given namespace. 
+func (s reservationNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.Reservation, err error) { + err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { + ret = append(ret, m.(*v1alpha1.Reservation)) + }) + return ret, err +} + +// Get retrieves the Reservation from the indexer for a given namespace and name. +func (s reservationNamespaceLister) Get(name string) (*v1alpha1.Reservation, error) { + obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) if err != nil { return nil, err } diff --git a/pkg/features/koord_features.go b/pkg/features/koord_features.go deleted file mode 100644 index 749555ed7..000000000 --- a/pkg/features/koord_features.go +++ /dev/null @@ -1,43 +0,0 @@ -/* -Copyright 2022 The Koordinator Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package features - -import ( - "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/component-base/featuregate" -) - -const ( - // NodeMetricControl is responsible for NodeMetric CR reconciliation - NodeMetricControl featuregate.Feature = "NodeMetricControl" - // NodeResourceControl is responsible for node BE allocatable resource calculation and reporting - NodeResourceControl featuregate.Feature = "NodeResourceControl" -) - -func init() { - runtime.Must(defaultKoordCtrlMutableFeatureGate.Add(defaultKoordCtrlFeatureGates)) -} - -var ( - defaultKoordCtrlMutableFeatureGate featuregate.MutableFeatureGate = featuregate.NewFeatureGate() - DefaultKoordCtlFeatureGate featuregate.FeatureGate = defaultKoordCtrlMutableFeatureGate - - defaultKoordCtrlFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ - NodeMetricControl: {Default: true, PreRelease: featuregate.Beta}, - NodeResourceControl: {Default: false, PreRelease: featuregate.Alpha}, - } -) diff --git a/pkg/koordlet/audit/auditor.go b/pkg/koordlet/audit/auditor.go index cb5bd1e41..0192d101d 100644 --- a/pkg/koordlet/audit/auditor.go +++ b/pkg/koordlet/audit/auditor.go @@ -159,7 +159,7 @@ func (a *auditor) HttpHandler() func(http.ResponseWriter, *http.Request) { } else { activeReader = a.findActiveReader(pageToken) if activeReader == nil { - http.Error(rw, fmt.Sprintf("reader %v is existed", pageToken), http.StatusConflict) + http.Error(rw, fmt.Sprintf("invalid pageToken %s", pageToken), http.StatusConflict) return } } diff --git a/pkg/koordlet/audit/auditor_test.go b/pkg/koordlet/audit/auditor_test.go index 858f5be4a..7f18d05cb 100644 --- a/pkg/koordlet/audit/auditor_test.go +++ b/pkg/koordlet/audit/auditor_test.go @@ -18,50 +18,23 @@ package audit import ( "bytes" - "context" "encoding/json" "fmt" "io/ioutil" - "net" "net/http" + "net/http/httptest" "testing" "time" ) -type TestServer struct { - l net.Listener - server *http.Server -} - -func (t *TestServer) Serve() { - t.server.Serve(t.l) -} - -func (t *TestServer) Shutdown() error { - t.l.Close() - return t.server.Shutdown(context.TODO()) -} - -func (t *TestServer) URL(size int, pageToken string) string { - url := fmt.Sprintf("http://:%d?size=%d", t.l.Addr().(*net.TCPAddr).Port, 
size) +func makeRequestUrl(size int, serverUrl, pageToken string) string { + url := fmt.Sprintf("%s?size=%d", serverUrl, size) if pageToken != "" { url += fmt.Sprintf("&pageToken=%s", pageToken) } return url } -func mustCreateHttpServer(t *testing.T, handler http.Handler) *TestServer { - l, err := net.Listen("tcp", ":0") - if err != nil { - t.Fatal(err) - } - server := &http.Server{Handler: handler} - return &TestServer{ - l: l, - server: server, - } -} - func TestAuditorLogger(t *testing.T) { tempDir := t.TempDir() @@ -76,31 +49,31 @@ func TestAuditorLogger(t *testing.T) { } logger.Flush() - server := mustCreateHttpServer(t, http.HandlerFunc(auditor.HttpHandler())) - defer server.Shutdown() - go func() { - server.Serve() - }() + server := httptest.NewServer(http.HandlerFunc(auditor.HttpHandler())) + defer server.Close() client := http.Client{} - req, _ := http.NewRequest("GET", server.URL(10, ""), nil) + req, _ := http.NewRequest("GET", makeRequestUrl(10, server.URL, ""), nil) req.Header.Add("Accept", "application/json") resp, err := client.Do(req) + if err != nil { t.Fatalf("failed to get events: %v", err) } + defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) response := &JsonResponse{} if err := json.Unmarshal(body, response); err != nil { t.Fatal(err) } + if len(response.Events) != 10 { t.Errorf("failed to load events, expected %d actual %d", 10, len(response.Events)) } // continue read logs - req, _ = http.NewRequest("GET", server.URL(1, response.NextPageToken), nil) + req, _ = http.NewRequest("GET", makeRequestUrl(1, server.URL, response.NextPageToken), nil) req.Header.Add("Accept", "application/json") resp, err = client.Do(req) if err != nil { @@ -124,7 +97,7 @@ func TestAuditorLogger(t *testing.T) { count := 0 stepSize := 5 for { - req, _ = http.NewRequest("GET", server.URL(stepSize, response.NextPageToken), nil) + req, _ = http.NewRequest("GET", makeRequestUrl(stepSize, server.URL, response.NextPageToken), nil) req.Header.Add("Accept", "application/json") resp, err = client.Do(req) if err != nil { @@ -146,7 +119,6 @@ func TestAuditorLogger(t *testing.T) { t.Errorf("failed to read to the end, expected %v actual %v", len(blocks)-11, count) } }() - } func TestAuditorLoggerTxtOutput(t *testing.T) { @@ -163,14 +135,11 @@ func TestAuditorLoggerTxtOutput(t *testing.T) { } logger.Flush() - server := mustCreateHttpServer(t, http.HandlerFunc(auditor.HttpHandler())) - defer server.Shutdown() - go func() { - server.Serve() - }() + server := httptest.NewServer(http.HandlerFunc(auditor.HttpHandler())) + defer server.Close() client := http.Client{} - req, _ := http.NewRequest("GET", server.URL(10, ""), nil) + req, _ := http.NewRequest("GET", makeRequestUrl(10, server.URL, ""), nil) resp, err := client.Do(req) if err != nil { t.Fatalf("failed to get events: %v", err) @@ -204,14 +173,11 @@ func TestAuditorLoggerReaderInvalidPageToken(t *testing.T) { } logger.Flush() - server := mustCreateHttpServer(t, http.HandlerFunc(auditor.HttpHandler())) - defer server.Shutdown() - go func() { - server.Serve() - }() + server := httptest.NewServer(http.HandlerFunc(auditor.HttpHandler())) + defer server.Close() client := http.Client{} - req, _ := http.NewRequest("GET", server.URL(10, ""), nil) + req, _ := http.NewRequest("GET", makeRequestUrl(10, server.URL, ""), nil) resp, err := client.Do(req) if err != nil { t.Fatalf("failed to get events: %v", err) @@ -235,7 +201,7 @@ func TestAuditorLoggerReaderInvalidPageToken(t *testing.T) { time.Sleep(time.Second) // request with expired token - req, _ = 
http.NewRequest("GET", server.URL(10, nextPageTokens[0]), nil) + req, _ = http.NewRequest("GET", makeRequestUrl(10, server.URL, nextPageTokens[0]), nil) resp, err = client.Do(req) if err != nil { t.Fatalf("failed to get events: %v", err) @@ -245,7 +211,7 @@ func TestAuditorLoggerReaderInvalidPageToken(t *testing.T) { } // request with not exists token - req, _ = http.NewRequest("GET", server.URL(10, "not-exists-token"), nil) + req, _ = http.NewRequest("GET", makeRequestUrl(10, server.URL, "not-exists-token"), nil) resp, err = client.Do(req) if err != nil { t.Fatalf("failed to get events: %v", err) @@ -269,16 +235,13 @@ func TestAuditorLoggerMaxActiveReaders(t *testing.T) { } logger.Flush() - server := mustCreateHttpServer(t, http.HandlerFunc(ad.HttpHandler())) - defer server.Shutdown() - go func() { - server.Serve() - }() + server := httptest.NewServer(http.HandlerFunc(ad.HttpHandler())) + defer server.Close() client := http.Client{} for i := 0; i < c.MaxConcurrentReaders+5; i++ { - req, _ := http.NewRequest("GET", server.URL(10, ""), nil) + req, _ := http.NewRequest("GET", makeRequestUrl(10, server.URL, ""), nil) resp, err := client.Do(req) if err != nil { t.Fatalf("failed to get events: %v", err) diff --git a/pkg/koordlet/audit/config.go b/pkg/koordlet/audit/config.go index c26c159d1..974f14853 100644 --- a/pkg/koordlet/audit/config.go +++ b/pkg/koordlet/audit/config.go @@ -46,7 +46,7 @@ func NewDefaultConfig() *Config { } func (c *Config) InitFlags(fs *flag.FlagSet) { - fs.StringVar(&c.LogDir, "AuditLogDir", c.LogDir, "The dir of audit log") - fs.IntVar(&c.Verbose, "AuditVerbose", c.Verbose, "The verbose of the audit log") - fs.IntVar(&c.MaxDiskSpaceMB, "AuditMaxDiskSpaceMB", c.MaxDiskSpaceMB, "Max disk space occupied of audit log") + fs.StringVar(&c.LogDir, "audit-log-dir", c.LogDir, "The dir of audit log") + fs.IntVar(&c.Verbose, "audit-verbose", c.Verbose, "The verbose of the audit log") + fs.IntVar(&c.MaxDiskSpaceMB, "audit-max-disk-space-mb", c.MaxDiskSpaceMB, "Max disk space occupied of audit log") } diff --git a/pkg/koordlet/audit/config_test.go b/pkg/koordlet/audit/config_test.go new file mode 100644 index 000000000..d0a009b76 --- /dev/null +++ b/pkg/koordlet/audit/config_test.go @@ -0,0 +1,92 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package audit + +import ( + "flag" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func Test_NewDefaultConfig(t *testing.T) { + expectConfig := &Config{ + LogDir: "/var/log/koordlet", + Verbose: 3, + MaxDiskSpaceMB: 16, + MaxConcurrentReaders: 4, + ActiveReaderTTL: time.Minute * 10, + DefaultEventsLimit: 256, + MaxEventsLimit: 2048, + TickerDuration: time.Minute, + } + defaultConfig := NewDefaultConfig() + assert.Equal(t, expectConfig, defaultConfig) +} + +func Test_InitFlags(t *testing.T) { + cmdArgs := []string{ + "", + "--audit-log-dir=/tmp/log/koordlet", + "--audit-verbose=4", + "--audit-max-disk-space-mb=32", + } + fs := flag.NewFlagSet(cmdArgs[0], flag.ExitOnError) + + type fields struct { + LogDir string + Verbose int + MaxDiskSpaceMB int + } + type args struct { + fs *flag.FlagSet + } + tests := []struct { + name string + fields fields + args args + }{ + { + name: "not default", + fields: fields{ + LogDir: "/tmp/log/koordlet", + Verbose: 4, + MaxDiskSpaceMB: 32, + }, + args: args{fs: fs}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + raw := &Config{ + LogDir: tt.fields.LogDir, + Verbose: tt.fields.Verbose, + MaxDiskSpaceMB: tt.fields.MaxDiskSpaceMB, + MaxConcurrentReaders: 4, + ActiveReaderTTL: time.Minute * 10, + DefaultEventsLimit: 256, + MaxEventsLimit: 2048, + TickerDuration: time.Minute, + } + c := NewDefaultConfig() + c.InitFlags(tt.args.fs) + tt.args.fs.Parse(cmdArgs[1:]) + assert.Equal(t, raw, c) + }) + } +} diff --git a/pkg/koordlet/metriccache/api.go b/pkg/koordlet/metriccache/api.go index a84b3bff6..9ed255f11 100644 --- a/pkg/koordlet/metriccache/api.go +++ b/pkg/koordlet/metriccache/api.go @@ -26,6 +26,13 @@ type CPUMetric struct { CPUUsed resource.Quantity } +type GPUMetric struct { + Minor int32 // index starting from 0 + DeviceUUID string // device UUID + SMUtil uint32 // current utilization rate for the device + MemoryUsed resource.Quantity // used memory on the device, in bytes +} + type MemoryMetric struct { MemoryWithoutCache resource.Quantity } @@ -37,6 +44,7 @@ type CPUThrottledMetric struct { type NodeResourceMetric struct { CPUUsed CPUMetric MemoryUsed MemoryMetric + GPUs []GPUMetric } type NodeResourceQueryResult struct { @@ -48,6 +56,7 @@ type PodResourceMetric struct { PodUID string CPUUsed CPUMetric MemoryUsed MemoryMetric + GPUs []GPUMetric } type PodResourceQueryResult struct { @@ -59,6 +68,7 @@ type ContainerResourceMetric struct { ContainerID string CPUUsed CPUMetric MemoryUsed MemoryMetric + GPUs []GPUMetric } type ContainerResourceQueryResult struct { diff --git a/pkg/koordlet/metriccache/config.go b/pkg/koordlet/metriccache/config.go index 5f22af3be..a77b02573 100644 --- a/pkg/koordlet/metriccache/config.go +++ b/pkg/koordlet/metriccache/config.go @@ -31,6 +31,6 @@ func NewDefaultConfig() *Config { } func (c *Config) InitFlags(fs *flag.FlagSet) { - fs.IntVar(&c.MetricGCIntervalSeconds, "MetricGCIntervalSeconds", c.MetricGCIntervalSeconds, "Collect node metrics interval by seconds") - fs.IntVar(&c.MetricExpireSeconds, "MetricExpireSeconds", c.MetricExpireSeconds, "Collect pod metrics interval by seconds") + fs.IntVar(&c.MetricGCIntervalSeconds, "metric-gc-interval-seconds", c.MetricGCIntervalSeconds, "Collect node metrics interval by seconds") + fs.IntVar(&c.MetricExpireSeconds, "metric-expire-seconds", c.MetricExpireSeconds, "Collect pod metrics interval by seconds") } diff --git a/pkg/koordlet/metriccache/config_test.go b/pkg/koordlet/metriccache/config_test.go new file mode 100644 index 
000000000..c69bcee1c --- /dev/null +++ b/pkg/koordlet/metriccache/config_test.go @@ -0,0 +1,76 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metriccache + +import ( + "flag" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_NewDefaultConfig(t *testing.T) { + expectConfig := &Config{ + MetricGCIntervalSeconds: 300, + MetricExpireSeconds: 1800, + } + defaultConfig := NewDefaultConfig() + assert.Equal(t, expectConfig, defaultConfig) +} + +func Test_InitFlags(t *testing.T) { + cmdArgs := []string{ + "", + "--metric-gc-interval-seconds=100", + "--metric-expire-seconds=600", + } + fs := flag.NewFlagSet(cmdArgs[0], flag.ExitOnError) + + type fields struct { + MetricGCIntervalSeconds int + MetricExpireSeconds int + } + type args struct { + fs *flag.FlagSet + } + tests := []struct { + name string + fields fields + args args + }{ + { + name: "not default", + fields: fields{ + MetricGCIntervalSeconds: 100, + MetricExpireSeconds: 600, + }, + args: args{fs: fs}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + raw := &Config{ + MetricGCIntervalSeconds: tt.fields.MetricGCIntervalSeconds, + MetricExpireSeconds: tt.fields.MetricExpireSeconds, + } + c := NewDefaultConfig() + c.InitFlags(tt.args.fs) + tt.args.fs.Parse(cmdArgs[1:]) + assert.Equal(t, raw, c) + }) + } +} diff --git a/pkg/koordlet/metriccache/metric_cache.go b/pkg/koordlet/metriccache/metric_cache.go index a4dda09eb..762c5a24b 100644 --- a/pkg/koordlet/metriccache/metric_cache.go +++ b/pkg/koordlet/metriccache/metric_cache.go @@ -29,6 +29,8 @@ import ( type AggregationType string +type AggregationFunc func(interface{}, AggregateParam) (float64, error) + const ( AggregationTypeAVG AggregationType = "AVG" AggregationTypeP90 AggregationType = "P90" @@ -132,6 +134,25 @@ func (m *metricCache) GetNodeResourceMetric(param *QueryParam) NodeResourceQuery return result } + // gpu metrics time series. + // m.GPUs is a slice. + gpuUsagesByTime := make([][]gpuResourceMetric, 0) + for _, m := range metrics { + if len(m.GPUs) == 0 { + continue + } + gpuUsagesByTime = append(gpuUsagesByTime, m.GPUs) + } + + var aggregateGPUMetrics []GPUMetric + if len(gpuUsagesByTime) > 0 { + aggregateGPUMetrics, err = m.aggregateGPUUsages(gpuUsagesByTime, aggregateFunc) + if err != nil { + result.Error = fmt.Errorf("get node aggregate GPUMetric failed, metrics %v, error %v", metrics, err) + return result + } + } + count, err := count(metrics) if err != nil { result.Error = fmt.Errorf("get node aggregate count failed, metrics %v, error %v", metrics, err) @@ -146,6 +167,7 @@ func (m *metricCache) GetNodeResourceMetric(param *QueryParam) NodeResourceQuery MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(int64(memoryUsed), resource.BinarySI), }, + GPUs: aggregateGPUMetrics, } return result @@ -181,6 +203,25 @@ func (m *metricCache) GetPodResourceMetric(podUID *string, param *QueryParam) Po return result } + // gpu metrics time series. + // m.GPUs is a slice. 
+ gpuUsagesByTime := make([][]gpuResourceMetric, 0) + for _, m := range metrics { + if len(m.GPUs) == 0 { + continue + } + gpuUsagesByTime = append(gpuUsagesByTime, m.GPUs) + } + + var aggregateGPUMetrics []GPUMetric + if len(gpuUsagesByTime) > 0 { + aggregateGPUMetrics, err = m.aggregateGPUUsages(gpuUsagesByTime, aggregateFunc) + if err != nil { + result.Error = fmt.Errorf("get pod aggregate GPUMetric failed, metrics %v, error %v", metrics, err) + return result + } + } + count, err := count(metrics) if err != nil { result.Error = fmt.Errorf("get node aggregate count failed, metrics %v, error %v", metrics, err) @@ -196,7 +237,9 @@ func (m *metricCache) GetPodResourceMetric(podUID *string, param *QueryParam) Po MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(int64(memoryUsed), resource.BinarySI), }, + GPUs: aggregateGPUMetrics, } + return result } @@ -226,17 +269,36 @@ func (m *metricCache) GetContainerResourceMetric(containerID *string, param *Que } memoryUsed, err := aggregateFunc(metrics, AggregateParam{ValueFieldName: "MemoryUsedBytes", TimeFieldName: "Timestamp"}) if err != nil { - result.Error = fmt.Errorf("get pod %v aggregate MemoryUsedBytes failed, metrics %v, error %v", + result.Error = fmt.Errorf("get container %v aggregate MemoryUsedBytes failed, metrics %v, error %v", containerID, metrics, err) return result } count, err := count(metrics) if err != nil { - result.Error = fmt.Errorf("get node aggregate count failed, metrics %v, error %v", metrics, err) + result.Error = fmt.Errorf("get container aggregate count failed, metrics %v, error %v", metrics, err) return result } + // gpu metrics time series. + // m.GPUs is a slice. + gpuUsagesByTime := make([][]gpuResourceMetric, 0) + for _, m := range metrics { + if len(m.GPUs) == 0 { + continue + } + gpuUsagesByTime = append(gpuUsagesByTime, m.GPUs) + } + + var aggregateGPUMetrics []GPUMetric + if len(gpuUsagesByTime) > 0 { + aggregateGPUMetrics, err = m.aggregateGPUUsages(gpuUsagesByTime, aggregateFunc) + if err != nil { + result.Error = fmt.Errorf("get container aggregate GPUMetric failed, metrics %v, error %v", metrics, err) + return result + } + } + result.AggregateInfo = &AggregateInfo{MetricsCount: int64(count)} result.Metric = &ContainerResourceMetric{ ContainerID: *containerID, @@ -246,6 +308,7 @@ func (m *metricCache) GetContainerResourceMetric(containerID *string, param *Que MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(int64(memoryUsed), resource.BinarySI), }, + GPUs: aggregateGPUMetrics, } return result } @@ -404,29 +467,64 @@ func (m *metricCache) GetContainerThrottledMetric(containerID *string, param *Qu } func (m *metricCache) InsertNodeResourceMetric(t time.Time, nodeResUsed *NodeResourceMetric) error { + gpuUsages := make([]gpuResourceMetric, len(nodeResUsed.GPUs)) + for idx, usage := range nodeResUsed.GPUs { + gpuUsages[idx] = gpuResourceMetric{ + DeviceUUID: usage.DeviceUUID, + Minor: usage.Minor, + SMUtil: float64(usage.SMUtil), + MemoryUsed: float64(usage.MemoryUsed.Value()), + Timestamp: t, + } + } + dbItem := &nodeResourceMetric{ CPUUsedCores: float64(nodeResUsed.CPUUsed.CPUUsed.MilliValue()) / 1000, MemoryUsedBytes: float64(nodeResUsed.MemoryUsed.MemoryWithoutCache.Value()), + GPUs: gpuUsages, Timestamp: t, } return m.db.InsertNodResourceMetric(dbItem) } func (m *metricCache) InsertPodResourceMetric(t time.Time, podResUsed *PodResourceMetric) error { + gpuUsages := make([]gpuResourceMetric, len(podResUsed.GPUs)) + for idx, usage := range podResUsed.GPUs { + 
gpuUsages[idx] = gpuResourceMetric{ + DeviceUUID: usage.DeviceUUID, + Minor: usage.Minor, + SMUtil: float64(usage.SMUtil), + MemoryUsed: float64(usage.MemoryUsed.Value()), + Timestamp: t, + } + } + dbItem := &podResourceMetric{ PodUID: podResUsed.PodUID, CPUUsedCores: float64(podResUsed.CPUUsed.CPUUsed.MilliValue()) / 1000, MemoryUsedBytes: float64(podResUsed.MemoryUsed.MemoryWithoutCache.Value()), + GPUs: gpuUsages, Timestamp: t, } return m.db.InsertPodResourceMetric(dbItem) } func (m *metricCache) InsertContainerResourceMetric(t time.Time, containerResUsed *ContainerResourceMetric) error { + gpuUsages := make([]gpuResourceMetric, len(containerResUsed.GPUs)) + for idx, usage := range containerResUsed.GPUs { + gpuUsages[idx] = gpuResourceMetric{ + DeviceUUID: usage.DeviceUUID, + Minor: usage.Minor, + SMUtil: float64(usage.SMUtil), + MemoryUsed: float64(usage.MemoryUsed.Value()), + Timestamp: t, + } + } dbItem := &containerResourceMetric{ ContainerID: containerResUsed.ContainerID, CPUUsedCores: float64(containerResUsed.CPUUsed.CPUUsed.MilliValue()) / 1000, MemoryUsedBytes: float64(containerResUsed.MemoryUsed.MemoryWithoutCache.Value()), + GPUs: gpuUsages, Timestamp: t, } return m.db.InsertContainerResourceMetric(dbItem) @@ -474,6 +572,49 @@ func (m *metricCache) InsertContainerThrottledMetrics(t time.Time, metric *Conta return m.db.InsertContainerThrottledMetric(dbItem) } +func (m *metricCache) aggregateGPUUsages(gpuResourceMetricsByTime [][]gpuResourceMetric, aggregateFunc AggregationFunc) ([]GPUMetric, error) { + if len(gpuResourceMetricsByTime) == 0 { + return nil, nil + } + deviceCount := len(gpuResourceMetricsByTime[0]) + // keep order by device minor. + gpuUsageByDevice := make([][]gpuResourceMetric, deviceCount) + for _, deviceMetrics := range gpuResourceMetricsByTime { + if len(deviceMetrics) != deviceCount { + return nil, fmt.Errorf("aggregateGPUUsages %v error: inconsistent time series dimensions, deviceCount %d", deviceMetrics, deviceCount) + } + for devIdx, m := range deviceMetrics { + gpuUsageByDevice[devIdx] = append(gpuUsageByDevice[devIdx], m) + } + } + + metrics := make([]GPUMetric, 0) + for _, v := range gpuUsageByDevice { + if len(v) == 0 { + continue + } + smutil, err := aggregateFunc(v, AggregateParam{ValueFieldName: "SMUtil", TimeFieldName: "Timestamp"}) + if err != nil { + return nil, err + } + + memoryUsed, err := aggregateFunc(v, AggregateParam{ValueFieldName: "MemoryUsed", TimeFieldName: "Timestamp"}) + if err != nil { + return nil, err + } + + g := GPUMetric{ + DeviceUUID: v[len(v)-1].DeviceUUID, + Minor: v[len(v)-1].Minor, + SMUtil: uint32(smutil), + MemoryUsed: *resource.NewQuantity(int64(memoryUsed), resource.BinarySI), + } + metrics = append(metrics, g) + } + + return metrics, nil +} + func (m *metricCache) recycleDB() { now := time.Now() oldTime := time.Unix(0, 0) @@ -500,7 +641,7 @@ func (m *metricCache) recycleDB() { klog.Infof("expired metric data before %v has been recycled", expiredTime) } -func getAggregateFunc(aggregationType AggregationType) func(interface{}, AggregateParam) (float64, error) { +func getAggregateFunc(aggregationType AggregationType) AggregationFunc { switch aggregationType { case AggregationTypeAVG: return fieldAvgOfMetricList diff --git a/pkg/koordlet/metriccache/metric_cache_test.go b/pkg/koordlet/metriccache/metric_cache_test.go index 381df72cc..2607fd6be 100644 --- a/pkg/koordlet/metriccache/metric_cache_test.go +++ b/pkg/koordlet/metriccache/metric_cache_test.go @@ -21,9 +21,8 @@ import ( "testing" "time" - 
"k8s.io/apimachinery/pkg/api/resource" - "github.com/koordinator-sh/koordinator/pkg/util" + "k8s.io/apimachinery/pkg/api/resource" ) func Test_metricCache_NodeResourceMetric_CRUD(t *testing.T) { @@ -54,6 +53,10 @@ func Test_metricCache_NodeResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(30, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 80, MemoryUsed: *resource.NewQuantity(30, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 40, MemoryUsed: *resource.NewQuantity(50, resource.BinarySI)}, + }, }, now.Add(-time.Second * 10): { CPUUsed: CPUMetric{ @@ -62,6 +65,10 @@ func Test_metricCache_NodeResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(10, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 60, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 50, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + }, }, now.Add(-time.Second * 5): { CPUUsed: CPUMetric{ @@ -70,6 +77,10 @@ func Test_metricCache_NodeResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(20, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 70, MemoryUsed: *resource.NewQuantity(50, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 60, MemoryUsed: *resource.NewQuantity(30, resource.BinarySI)}, + }, }, }, }, @@ -81,6 +92,10 @@ func Test_metricCache_NodeResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(20, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 70, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 50, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + }, }, QueryResult: QueryResult{AggregateInfo: &AggregateInfo{MetricsCount: 3}}, }, @@ -92,6 +107,10 @@ func Test_metricCache_NodeResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(15, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 65, MemoryUsed: *resource.NewQuantity(45, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 55, MemoryUsed: *resource.NewQuantity(35, resource.BinarySI)}, + }, }, QueryResult: QueryResult{AggregateInfo: &AggregateInfo{MetricsCount: 2}}, }, @@ -304,6 +323,10 @@ func Test_metricCache_ContainerResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(30, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 80, MemoryUsed: *resource.NewQuantity(30, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 40, MemoryUsed: *resource.NewQuantity(50, resource.BinarySI)}, + }, }, now.Add(-time.Second * 10): { ContainerID: "container-id-1", @@ -313,6 +336,10 @@ func Test_metricCache_ContainerResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(10, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 60, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 50, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + }, }, now.Add(-time.Second * 5): { ContainerID: "container-id-1", @@ -322,6 +349,10 @@ func Test_metricCache_ContainerResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ 
MemoryWithoutCache: *resource.NewQuantity(20, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 70, MemoryUsed: *resource.NewQuantity(50, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 60, MemoryUsed: *resource.NewQuantity(30, resource.BinarySI)}, + }, }, now.Add(-time.Second * 4): { ContainerID: "container-id-2", @@ -343,6 +374,10 @@ func Test_metricCache_ContainerResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(20, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 70, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 50, MemoryUsed: *resource.NewQuantity(40, resource.BinarySI)}, + }, }, QueryResult: QueryResult{AggregateInfo: &AggregateInfo{MetricsCount: 3}}, }, @@ -355,6 +390,10 @@ func Test_metricCache_ContainerResourceMetric_CRUD(t *testing.T) { MemoryUsed: MemoryMetric{ MemoryWithoutCache: *resource.NewQuantity(15, resource.BinarySI), }, + GPUs: []GPUMetric{ + {DeviceUUID: "1", Minor: 0, SMUtil: 65, MemoryUsed: *resource.NewQuantity(45, resource.BinarySI)}, + {DeviceUUID: "2", Minor: 1, SMUtil: 55, MemoryUsed: *resource.NewQuantity(35, resource.BinarySI)}, + }, }, QueryResult: QueryResult{AggregateInfo: &AggregateInfo{MetricsCount: 2}}, }, @@ -458,7 +497,6 @@ func Test_metricCache_ContainerResourceMetric_CRUD(t *testing.T) { if got.Error != nil { t.Errorf("get container metric failed %v", got.Error) } - if !reflect.DeepEqual(got, tt.want) { t.Errorf("GetContainerResourceMetric() got = %v, want %v", got, tt.want) } @@ -938,3 +976,140 @@ func Test_metricCache_PodThrottledMetric_CRUD(t *testing.T) { }) } } + +func Test_metricCache_aggregateGPUUsages(t *testing.T) { + type fields struct { + config *Config + } + type args struct { + gpuResourceMetrics [][]gpuResourceMetric + aggregateFunc AggregationFunc + } + tests := []struct { + name string + fields fields + args args + want []GPUMetric + wantErr bool + }{ + { + name: "sample device", + fields: fields{ + config: &Config{ + MetricGCIntervalSeconds: 60, + MetricExpireSeconds: 60, + }, + }, + args: args{ + aggregateFunc: getAggregateFunc(AggregationTypeAVG), + gpuResourceMetrics: [][]gpuResourceMetric{ + { + {DeviceUUID: "1-1", Minor: 0, SMUtil: 20, MemoryUsed: 1000}, + {DeviceUUID: "2-1", Minor: 1, SMUtil: 40, MemoryUsed: 2000}, + }, + { + {DeviceUUID: "1-1", Minor: 0, SMUtil: 40, MemoryUsed: 4000}, + {DeviceUUID: "2-1", Minor: 1, SMUtil: 30, MemoryUsed: 1000}, + }, + }, + }, + want: []GPUMetric{ + {DeviceUUID: "1-1", Minor: 0, SMUtil: 30, MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI)}, + {DeviceUUID: "2-1", Minor: 1, SMUtil: 35, MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI)}, + }, + }, + + { + name: "difference device", + fields: fields{ + config: &Config{ + MetricGCIntervalSeconds: 60, + MetricExpireSeconds: 60, + }, + }, + args: args{ + aggregateFunc: getAggregateFunc(AggregationTypeAVG), + gpuResourceMetrics: [][]gpuResourceMetric{ + { + {DeviceUUID: "1-1", Minor: 0, SMUtil: 20, MemoryUsed: 1000}, + {DeviceUUID: "2-1", Minor: 1, SMUtil: 40, MemoryUsed: 4000}, + }, + { + {DeviceUUID: "3-1", Minor: 2, SMUtil: 40, MemoryUsed: 4000}, + {DeviceUUID: "4-1", Minor: 3, SMUtil: 30, MemoryUsed: 1000}, + }, + }, + }, + want: []GPUMetric{ + {DeviceUUID: "3-1", Minor: 2, SMUtil: 30, MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI)}, + {DeviceUUID: "4-1", Minor: 3, SMUtil: 35, MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI)}, 
+ }, + }, + { + name: "single device", + fields: fields{ + config: &Config{ + MetricGCIntervalSeconds: 60, + MetricExpireSeconds: 60, + }, + }, + args: args{ + aggregateFunc: getAggregateFunc(AggregationTypeAVG), + gpuResourceMetrics: [][]gpuResourceMetric{ + { + {DeviceUUID: "2-1", Minor: 1, SMUtil: 40, MemoryUsed: 2000}, + }, + { + {DeviceUUID: "2-1", Minor: 1, SMUtil: 30, MemoryUsed: 1000}, + }, + }, + }, + want: []GPUMetric{ + {DeviceUUID: "2-1", Minor: 1, SMUtil: 35, MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI)}, + }, + }, + { + name: "single device and multiple device", + fields: fields{ + config: &Config{ + MetricGCIntervalSeconds: 60, + MetricExpireSeconds: 60, + }, + }, + args: args{ + aggregateFunc: getAggregateFunc(AggregationTypeAVG), + gpuResourceMetrics: [][]gpuResourceMetric{ + { + {DeviceUUID: "1-1", Minor: 0, SMUtil: 20, MemoryUsed: 1000}, + {DeviceUUID: "3-1", Minor: 3, SMUtil: 40, MemoryUsed: 2000}, + }, + { + {DeviceUUID: "1-1", Minor: 0, SMUtil: 40, MemoryUsed: 1000}, + {DeviceUUID: "3-1", Minor: 3, SMUtil: 30, MemoryUsed: 1000}, + }, + }, + }, + want: []GPUMetric{ + {DeviceUUID: "1-1", Minor: 0, SMUtil: 30, MemoryUsed: *resource.NewQuantity(1000, resource.BinarySI)}, + {DeviceUUID: "3-1", Minor: 3, SMUtil: 35, MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI)}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s, _ := NewStorage() + m := &metricCache{ + config: tt.fields.config, + db: s, + } + got, err := m.aggregateGPUUsages(tt.args.gpuResourceMetrics, tt.args.aggregateFunc) + if (err != nil) != tt.wantErr { + t.Errorf("metricCache.aggregateGPUUsages() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("metricCache.aggregateGPUUsages() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/koordlet/metriccache/storage_tables.go b/pkg/koordlet/metriccache/storage_tables.go index 6f96ea4e9..df263945a 100644 --- a/pkg/koordlet/metriccache/storage_tables.go +++ b/pkg/koordlet/metriccache/storage_tables.go @@ -17,6 +17,10 @@ limitations under the License. package metriccache import ( + "database/sql/driver" + "encoding/json" + "errors" + "fmt" "time" ) @@ -24,10 +28,43 @@ const ( NodeCPUInfoRecordType = "NodeCPUInfo" ) +type gpuResourceMetric struct { + Minor int32 // index starting from 0 + DeviceUUID string // device UUID + SMUtil float64 // current utilization rate for the device + MemoryUsed float64 // used memory on the device, in bytes + Timestamp time.Time +} + +type GPUMetricsArray []gpuResourceMetric + +// Implement gorm customize data type. +// Read data from database. +func (array *GPUMetricsArray) Scan(value interface{}) error { + if value == nil { + return nil + } + bytes, ok := value.([]byte) + if !ok { + return errors.New(fmt.Sprint("Failed to unmarshal JSONB value:", value)) + } + return json.Unmarshal(bytes, array) +} + +// Implement gorm customize data type. +// Write data to database. 
+func (array GPUMetricsArray) Value() (driver.Value, error) { + if array == nil { + return nil, nil + } + return json.Marshal(array) +} + type nodeResourceMetric struct { ID uint64 `gorm:"primarykey"` CPUUsedCores float64 MemoryUsedBytes float64 + GPUs GPUMetricsArray `gorm:"type:text"` Timestamp time.Time } @@ -36,6 +73,7 @@ type podResourceMetric struct { PodUID string `gorm:"index:idx_pod_res_uid"` CPUUsedCores float64 MemoryUsedBytes float64 + GPUs GPUMetricsArray `gorm:"type:text"` Timestamp time.Time } @@ -44,6 +82,7 @@ type containerResourceMetric struct { ContainerID string `gorm:"index:idx_container_res_uid"` CPUUsedCores float64 MemoryUsedBytes float64 + GPUs GPUMetricsArray `gorm:"type:text"` Timestamp time.Time } diff --git a/pkg/koordlet/metricsadvisor/collector.go b/pkg/koordlet/metricsadvisor/collector.go index 2ee03df7e..e6f20cfcc 100644 --- a/pkg/koordlet/metricsadvisor/collector.go +++ b/pkg/koordlet/metricsadvisor/collector.go @@ -17,7 +17,7 @@ limitations under the License. package metricsadvisor import ( - "fmt" + "bytes" "os" "os/exec" "strconv" @@ -46,8 +46,7 @@ const ( var ( // jiffies is the duration unit of CPU stats - jiffies = float64(10 * time.Millisecond) - + jiffies = float64(10 * time.Millisecond) localCPUInfoGetter = util.GetLocalCPUInfo ) @@ -71,6 +70,8 @@ type collectContext struct { lastPodCPUThrottled sync.Map lastContainerCPUThrottled sync.Map + + gpuDeviceManager GPUDeviceManager } func newCollectContext() *collectContext { @@ -79,6 +80,7 @@ func newCollectContext() *collectContext { lastContainerCPUStat: sync.Map{}, lastPodCPUThrottled: sync.Map{}, lastContainerCPUThrottled: sync.Map{}, + gpuDeviceManager: initGPUDeviceManager(), } } @@ -101,6 +103,7 @@ func NewCollector(cfg *Config, statesInformer statesinformer.StatesInformer, met if c.config == nil { c.config = NewDefaultConfig() } + return c } @@ -110,6 +113,7 @@ func (c *collector) HasSynced() bool { func (c *collector) Run(stopCh <-chan struct{}) error { defer utilruntime.HandleCrash() + defer c.context.gpuDeviceManager.shutdown() klog.Info("Starting collector for NodeMetric") defer klog.Info("shutting down daemon") if c.config.CollectResUsedIntervalSeconds <= 0 { @@ -125,6 +129,9 @@ func (c *collector) Run(stopCh <-chan struct{}) error { } go wait.Until(func() { + + // collect gpu metrics. + c.collectGPUUsage() c.collectNodeResUsed() // add sync metaService cache check before collect pod information // because collect function will get all pods. 
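A minimal sketch of how the GPUMetricsArray column added to the metric tables above is expected to round-trip through gorm. The db handle, the literal values, and the "time" import are assumptions for illustration, not part of this patch: Value() marshals the per-device samples to JSON for the TEXT column on insert, and Scan() unmarshals them back on query.

	// Illustrative only: assumes a sqlite-backed *gorm.DB named db is already open.
	sample := GPUMetricsArray{
		{Minor: 0, DeviceUUID: "uuid-0", SMUtil: 60, MemoryUsed: 1 << 30, Timestamp: time.Now()},
	}
	record := &nodeResourceMetric{CPUUsedCores: 1.5, MemoryUsedBytes: 2 << 30, GPUs: sample, Timestamp: time.Now()}
	db.Create(record) // Value() serializes sample to JSON before the INSERT
	var latest nodeResourceMetric
	db.Last(&latest) // Scan() restores latest.GPUs from the stored JSON text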
@@ -148,26 +155,24 @@ func (c *collector) Run(stopCh <-chan struct{}) error { return nil } +// initJiffies use command "getconf CLK_TCK" to fetch the clock tick on current host, +// if the command doesn't exist, uses the default value 10ms for jiffies func initJiffies() error { - // retrieve jiffies - clkTckStdout, err := exec.Command("getconf", "CLK_TCK").Output() + getconf, err := exec.LookPath("getconf") if err != nil { - return err - } - clkTckStdoutStrs := strings.Split(string(clkTckStdout), "\n") - if len(clkTckStdoutStrs) <= 0 { - return fmt.Errorf("getconf CLK_TCK returns empty") + return nil } - clkTckStdoutStr := strings.Fields(clkTckStdoutStrs[0]) - if len(clkTckStdoutStr) <= 0 { - return fmt.Errorf("getconf CLK_TCK returns empty") + cmd := exec.Command(getconf, "CLK_TCK") + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return err } - clkTck, err := strconv.Atoi(clkTckStdoutStr[0]) + ticks, err := strconv.ParseFloat(strings.TrimSpace(out.String()), 64) if err != nil { return err } - // clkTck (Hz) - jiffies = float64(time.Second / time.Duration(clkTck)) + jiffies = float64(time.Second / time.Duration(ticks)) return nil } @@ -192,6 +197,7 @@ func (c *collector) collectNodeResUsed() { // 1 jiffies could be 10ms // NOTICE: do subtraction and division first to avoid overflow cpuUsageValue := float64(currentCPUTick-lastCPUStat.cpuTick) / float64(collectTime.Sub(lastCPUStat.ts)) * jiffies + nodeMetric := metriccache.NodeResourceMetric{ CPUUsed: metriccache.CPUMetric{ // 1.0 CPU = 1000 Milli-CPU @@ -203,6 +209,8 @@ func (c *collector) collectNodeResUsed() { }, } + nodeMetric.GPUs = c.context.gpuDeviceManager.getNodeGPUUsage() + if err := c.metricCache.InsertNodeResourceMetric(collectTime, &nodeMetric); err != nil { klog.Errorf("insert node resource metric error: %v", err) } @@ -222,6 +230,7 @@ func (c *collector) collectPodResUsed() { collectTime := time.Now() currentCPUUsage, err0 := util.GetPodCPUUsageNanoseconds(meta.CgroupDir) memUsageValue, err1 := util.GetPodMemStatUsageBytes(meta.CgroupDir) + if err0 != nil || err1 != nil { // higher verbosity for probably non-running pods if pod.Status.Phase != corev1.PodRunning && pod.Status.Phase != corev1.PodPending { @@ -242,6 +251,7 @@ func (c *collector) collectPodResUsed() { klog.Infof("ignore the first cpu stat collection for pod %s/%s", pod.Namespace, pod.Name) continue } + lastCPUStat := lastCPUStatValue.(contextRecord) // NOTICE: do subtraction and division first to avoid overflow cpuUsageValue := float64(currentCPUUsage-lastCPUStat.cpuUsage) / float64(collectTime.Sub(lastCPUStat.ts)) @@ -256,6 +266,13 @@ func (c *collector) collectPodResUsed() { MemoryWithoutCache: *resource.NewQuantity(memUsageValue, resource.BinarySI), }, } + + if gpus, err := c.context.gpuDeviceManager.getPodGPUUsage(meta.CgroupDir, meta.Pod.Status.ContainerStatuses); err == nil { + podMetric.GPUs = gpus + } else { + klog.Errorf("get pod %s/%s gpu usage error: %v", meta.Pod.Namespace, meta.Pod.Name, err) + } + klog.V(6).Infof("collect pod %s/%s, uid %s finished, metric %+v", meta.Pod.Namespace, meta.Pod.Name, meta.Pod.UID, podMetric) @@ -279,6 +296,7 @@ func (c *collector) collectContainerResUsed(meta *statesinformer.PodMeta) { collectTime := time.Now() currentCPUUsage, err0 := util.GetContainerCPUUsageNanoseconds(meta.CgroupDir, containerStat) memUsageValue, err1 := util.GetContainerMemStatUsageBytes(meta.CgroupDir, containerStat) + if err0 != nil || err1 != nil { // higher verbosity for probably non-running pods if 
containerStat.State.Running == nil { @@ -290,6 +308,7 @@ func (c *collector) collectContainerResUsed(meta *statesinformer.PodMeta) { } continue } + lastCPUStatValue, ok := c.context.lastContainerCPUStat.Load(containerStat.ContainerID) c.context.lastContainerCPUStat.Store(containerStat.ContainerID, contextRecord{ cpuUsage: currentCPUUsage, @@ -314,6 +333,13 @@ func (c *collector) collectContainerResUsed(meta *statesinformer.PodMeta) { MemoryWithoutCache: *resource.NewQuantity(memUsageValue, resource.BinarySI), }, } + + if gpus, err := c.context.gpuDeviceManager.getContainerGPUUsage(meta.CgroupDir, containerStat); err == nil { + containerMetric.GPUs = gpus + } else { + klog.Errorf("get container %s/%s/%s gpu usage error: %v", pod.Namespace, pod.Name, containerStat.Name, err) + } + klog.V(6).Infof("collect container %s/%s/%s, id %s finished, metric %+v", meta.Pod.Namespace, meta.Pod.Name, containerStat.Name, meta.Pod.UID, containerMetric) if err := c.metricCache.InsertContainerResourceMetric(collectTime, &containerMetric); err != nil { @@ -362,7 +388,7 @@ func (c *collector) collectPodThrottledInfo() { if err != nil || currentCPUStat == nil { if pod.Status.Phase == corev1.PodRunning { // print running pod collection error - klog.Infof("collect pod %s/%s, uid %v cpu throttled failed, err %v, metric %v", + klog.V(4).Infof("collect pod %s/%s, uid %v cpu throttled failed, err %v, metric %v", pod.Namespace, pod.Name, uid, err, currentCPUStat) } continue @@ -402,7 +428,7 @@ func (c *collector) collectContainerThrottledInfo(podMeta *statesinformer.PodMet collectTime := time.Now() containerStat := &pod.Status.ContainerStatuses[i] if len(containerStat.ContainerID) == 0 { - klog.Infof("container %s/%s/%s id is empty, maybe not ready, skip this round", + klog.V(4).Infof("container %s/%s/%s id is empty, maybe not ready, skip this round", pod.Namespace, pod.Name, containerStat.Name) continue } @@ -414,7 +440,7 @@ func (c *collector) collectContainerThrottledInfo(podMeta *statesinformer.PodMet } currentCPUStat, err := system.GetCPUStatRaw(containerCgroupPath) if err != nil { - klog.Infof("collect container %s/%s/%s cpu throttled failed, err %v, metric %v", + klog.V(4).Infof("collect container %s/%s/%s cpu throttled failed, err %v, metric %v", pod.Namespace, pod.Name, containerStat.Name, err, currentCPUStat) continue } diff --git a/pkg/koordlet/metricsadvisor/collector_gpu.go b/pkg/koordlet/metricsadvisor/collector_gpu.go new file mode 100644 index 000000000..ce2858000 --- /dev/null +++ b/pkg/koordlet/metricsadvisor/collector_gpu.go @@ -0,0 +1,269 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metricsadvisor + +import ( + "errors" + "fmt" + "sort" + "sync" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/koordinator-sh/koordinator/pkg/util" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" + + "github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache" +) + +type GPUDeviceManager interface { + collectGPUUsage() + getNodeGPUUsage() []metriccache.GPUMetric + getPodGPUUsage(podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.GPUMetric, error) + getContainerGPUUsage(podParentDir string, c *corev1.ContainerStatus) ([]metriccache.GPUMetric, error) + shutdown() error +} + +type dummyDeviceManager struct{} + +func (d *dummyDeviceManager) collectGPUUsage() {} + +func (d *dummyDeviceManager) getNodeGPUUsage() []metriccache.GPUMetric { + return nil +} + +func (d *dummyDeviceManager) getPodGPUUsage(podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.GPUMetric, error) { + return nil, nil +} + +func (d *dummyDeviceManager) getContainerGPUUsage(podParentDir string, c *corev1.ContainerStatus) ([]metriccache.GPUMetric, error) { + return nil, nil +} + +func (d *dummyDeviceManager) shutdown() error { + return nil +} + +type gpuDeviceManager struct { + sync.RWMutex + deviceCount int + devices []*device + collectTime time.Time + processesMetrics map[uint32][]*rawGPUMetric +} + +type rawGPUMetric struct { + SMUtil uint32 // current utilization rate for the device + MemoryUsed uint64 +} + +type device struct { + Minor int32 // index starting from 0 + DeviceUUID string + Device nvml.Device +} + +// initGPUDeviceManager will not retry if init fails, +func initGPUDeviceManager() GPUDeviceManager { + if ret := nvml.Init(); ret != nvml.SUCCESS { + if ret == nvml.ERROR_LIBRARY_NOT_FOUND { + klog.Warning("nvml init failed, library not found") + return &dummyDeviceManager{} + } + klog.Warningf("nvml init failed, return %s", nvml.ErrorString(ret)) + return &dummyDeviceManager{} + } + manager := &gpuDeviceManager{} + if err := manager.initGPUData(); err != nil { + klog.Warningf("nvml init gpu data, error %s", err) + manager.shutdown() + return &dummyDeviceManager{} + } + + return manager +} + +func (g *gpuDeviceManager) shutdown() error { + rt := nvml.Shutdown() + if rt != nvml.SUCCESS { + return fmt.Errorf("nvml shutdown error, code: %s", nvml.ErrorString(rt)) + } + return nil +} + +func (g *gpuDeviceManager) initGPUData() error { + count, ret := nvml.DeviceGetCount() + if ret != nvml.SUCCESS { + return fmt.Errorf("unable to get device count: %v", nvml.ErrorString(ret)) + } + if count == 0 { + return errors.New("no gpu device found") + } + devices := make([]*device, count) + for deviceIndex := 0; deviceIndex < count; deviceIndex++ { + gpudevice, ret := nvml.DeviceGetHandleByIndex(deviceIndex) + if ret != nvml.SUCCESS { + return fmt.Errorf("unable to get device at index %d: %v", deviceIndex, nvml.ErrorString(ret)) + } + + uuid, ret := gpudevice.GetUUID() + if ret != nvml.SUCCESS { + return fmt.Errorf("unable to get device uuid: %v", nvml.ErrorString(ret)) + } + + minor, ret := gpudevice.GetMinorNumber() + if ret != nvml.SUCCESS { + return fmt.Errorf("unable to get device minor number: %v", nvml.ErrorString(ret)) + } + devices[deviceIndex] = &device{ + DeviceUUID: uuid, + Minor: int32(minor), + Device: gpudevice, + } + } + + g.Lock() + defer g.Unlock() + g.deviceCount = count + g.devices = devices + return nil +} + +func (g *gpuDeviceManager) getNodeGPUUsage() []metriccache.GPUMetric { + g.RLock() + defer 
g.RUnlock() + tmp := make([]rawGPUMetric, g.deviceCount) + for i := 0; i < g.deviceCount; i++ { + tmp[i] = rawGPUMetric{} + } + for _, p := range g.processesMetrics { + for idx := 0; idx < g.deviceCount; idx++ { + if m := p[uint32(idx)]; m != nil { + tmp[idx].SMUtil += p[uint32(idx)].SMUtil + tmp[idx].MemoryUsed += p[uint32(idx)].MemoryUsed + } + } + } + rtn := make([]metriccache.GPUMetric, g.deviceCount) + for i := 0; i < g.deviceCount; i++ { + rtn[i] = metriccache.GPUMetric{ + DeviceUUID: g.devices[i].DeviceUUID, + Minor: g.devices[i].Minor, + SMUtil: tmp[i].SMUtil, + MemoryUsed: *resource.NewQuantity(int64(tmp[i].MemoryUsed), resource.BinarySI), + } + } + return rtn +} + +func (g *gpuDeviceManager) getTotalGPUUsageOfPIDs(pids []uint64) []metriccache.GPUMetric { + g.RLock() + defer g.RUnlock() + tmp := make(map[int]*rawGPUMetric) + for _, pid := range pids { + if metrics, exist := g.processesMetrics[uint32(pid)]; exist { + for idx, metric := range metrics { + if metric == nil { + continue + } + if _, found := tmp[idx]; !found { + tmp[idx] = &rawGPUMetric{} + } + tmp[idx].MemoryUsed += metric.MemoryUsed + tmp[idx].SMUtil += metric.SMUtil + } + } + } + if len(tmp) == 0 { + return nil + } + rtn := make([]metriccache.GPUMetric, 0) + for i := 0; i < g.deviceCount; i++ { + if value, ok := tmp[i]; ok { + rtn = append(rtn, metriccache.GPUMetric{ + DeviceUUID: g.devices[i].DeviceUUID, + Minor: g.devices[i].Minor, + SMUtil: value.SMUtil, + MemoryUsed: *resource.NewQuantity(int64(value.MemoryUsed), resource.BinarySI), + }) + } + } + return rtn +} + +func (g *gpuDeviceManager) getPodGPUUsage(podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.GPUMetric, error) { + pids, err := util.GetPIDsInPod(podParentDir, cs) + if err != nil { + return nil, fmt.Errorf("failed to get pid, error: %v", err) + } + return g.getTotalGPUUsageOfPIDs(pids), nil +} + +func (g *gpuDeviceManager) getContainerGPUUsage(podParentDir string, c *corev1.ContainerStatus) ([]metriccache.GPUMetric, error) { + currentPIDs, err := util.GetPIDsInContainer(podParentDir, c) + if err != nil { + return nil, fmt.Errorf("failed to get pid, error: %v", err) + } + return g.getTotalGPUUsageOfPIDs(currentPIDs), nil +} + +func (g *gpuDeviceManager) collectGPUUsage() { + processesGPUUsages := make(map[uint32][]*rawGPUMetric) + for deviceIndex, gpuDevice := range g.devices { + processesInfos, ret := gpuDevice.Device.GetComputeRunningProcesses() + if ret != nvml.SUCCESS { + klog.Warningf("Unable to get process info for device at index %d: %v", deviceIndex, nvml.ErrorString(ret)) + continue + } + processUtilizations, ret := gpuDevice.Device.GetProcessUtilization(1024) + if ret != nvml.SUCCESS { + klog.Warningf("Unable to get process utilization for device at index %d: %v", deviceIndex, nvml.ErrorString(ret)) + continue + } + + // Sort by pid. + sort.Slice(processesInfos, func(i, j int) bool { + return processesInfos[i].Pid < processesInfos[j].Pid + }) + sort.Slice(processUtilizations, func(i, j int) bool { + return processUtilizations[i].Pid < processUtilizations[j].Pid + }) + + klog.V(3).Infof("Found %d processes on device %d\n", len(processesInfos), deviceIndex) + for idx, info := range processesInfos { + if _, ok := processesGPUUsages[info.Pid]; !ok { + // pid not exist. + // init processes gpu metric array. 
+ processesGPUUsages[info.Pid] = make([]*rawGPUMetric, g.deviceCount) + } + processesGPUUsages[info.Pid][deviceIndex] = &rawGPUMetric{ + SMUtil: processUtilizations[idx].SmUtil, + MemoryUsed: info.UsedGpuMemory, + } + } + } + g.Lock() + g.processesMetrics = processesGPUUsages + g.collectTime = time.Now() + g.Unlock() +} + +func (c *collector) collectGPUUsage() { + c.context.gpuDeviceManager.collectGPUUsage() +} diff --git a/pkg/koordlet/metricsadvisor/collector_gpu_test.go b/pkg/koordlet/metricsadvisor/collector_gpu_test.go new file mode 100644 index 000000000..ae7653c8e --- /dev/null +++ b/pkg/koordlet/metricsadvisor/collector_gpu_test.go @@ -0,0 +1,528 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metricsadvisor + +import ( + "io/ioutil" + "os" + "path" + "reflect" + "testing" + + "github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache" + "github.com/koordinator-sh/koordinator/pkg/util/system" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func Test_gpuUsageDetailRecord_GetNodeGPUUsage(t *testing.T) { + type fields struct { + deviceCount int + devices []*device + processesMetrics map[uint32][]*rawGPUMetric + } + tests := []struct { + name string + fields fields + want []metriccache.GPUMetric + }{ + { + name: "single device", + fields: fields{ + deviceCount: 1, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 70, + MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI), + }, + }, + }, + { + name: "multiple device", + fields: fields{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + {Minor: 1, DeviceUUID: "test-device2"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, nil}, + 222: {nil, {SMUtil: 50, MemoryUsed: 1000}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 70, + MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI), + }, + { + DeviceUUID: "test-device2", + Minor: 1, + SMUtil: 50, + MemoryUsed: *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + { + name: "process on multiple device", + fields: fields{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + {Minor: 1, DeviceUUID: "test-device2"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, {SMUtil: 30, MemoryUsed: 1000}}, + 222: {{SMUtil: 20, MemoryUsed: 1000}, {SMUtil: 50, MemoryUsed: 1000}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 90, + MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI), + }, + { + DeviceUUID: "test-device2", + Minor: 1, + SMUtil: 80, + MemoryUsed: *resource.NewQuantity(2000, resource.BinarySI), + }, + }, + }, + } + for _, tt := range 
tests { + t.Run(tt.name, func(t *testing.T) { + g := &gpuDeviceManager{ + deviceCount: tt.fields.deviceCount, + devices: tt.fields.devices, + processesMetrics: tt.fields.processesMetrics, + } + if got := g.getNodeGPUUsage(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("gpuUsageDetailRecord.GetNodeGPUUsage() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_gpuUsageDetailRecord_GetPIDsTotalGPUUsage(t *testing.T) { + type fields struct { + deviceCount int + devices []*device + processesMetrics map[uint32][]*rawGPUMetric + } + type args struct { + pids []uint64 + } + tests := []struct { + name string + fields fields + args args + want []metriccache.GPUMetric + }{ + { + name: "single device", + args: args{ + pids: []uint64{122}, + }, + fields: fields{ + deviceCount: 1, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}}, + 123: {{SMUtil: 20, MemoryUsed: 1000}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 70, + MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI), + }, + }, + }, + { + name: "multiple device", + args: args{ + pids: []uint64{122, 222}, + }, + fields: fields{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + {Minor: 1, DeviceUUID: "test-device2"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, nil}, + 222: {nil, {SMUtil: 50, MemoryUsed: 1000}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 70, + MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI), + }, + { + DeviceUUID: "test-device2", + Minor: 1, + SMUtil: 50, + MemoryUsed: *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + { + name: "multiple device-1", + args: args{ + pids: []uint64{122}, + }, + fields: fields{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + {Minor: 1, DeviceUUID: "test-device2"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, nil}, + 222: {nil, {SMUtil: 50, MemoryUsed: 1000}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 70, + MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI), + }, + }, + }, + { + name: "multiple device and multiple processes", + args: args{ + pids: []uint64{122, 222}, + }, + fields: fields{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "test-device1"}, + {Minor: 1, DeviceUUID: "test-device2"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, {SMUtil: 50, MemoryUsed: 1000}}, + 222: {{SMUtil: 10, MemoryUsed: 1000}, {SMUtil: 40, MemoryUsed: 3000}}, + }, + }, + want: []metriccache.GPUMetric{ + { + DeviceUUID: "test-device1", + Minor: 0, + SMUtil: 80, + MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI), + }, + { + DeviceUUID: "test-device2", + Minor: 1, + SMUtil: 90, + MemoryUsed: *resource.NewQuantity(4000, resource.BinarySI), + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := &gpuDeviceManager{ + deviceCount: tt.fields.deviceCount, + devices: tt.fields.devices, + processesMetrics: tt.fields.processesMetrics, + } + if got := g.getTotalGPUUsageOfPIDs(tt.args.pids); !reflect.DeepEqual(got, tt.want) { + t.Errorf("gpuUsageDetailRecord.GetPIDsTotalGPUUsage() = %v, want %v", got, tt.want) + } + }) + } +} + +func 
Test_gpuDeviceManager_getPodGPUUsage(t *testing.T) { + + system.SetupCgroupPathFormatter(system.Systemd) + defer system.SetupCgroupPathFormatter(system.Systemd) + dir := t.TempDir() + system.Conf.CgroupRootDir = dir + + p1 := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf.scope/cgroup.procs" + p1CgroupPath := path.Join(dir, p1) + if err := writeCgroupContent(p1CgroupPath, []byte("122\n222")); err != nil { + t.Fatal(err) + } + + p2 := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff.scope/cgroup.procs" + p2CgroupPath := path.Join(dir, p2) + if err := writeCgroupContent(p2CgroupPath, []byte("45\n67")); err != nil { + t.Fatal(err) + } + + type fields struct { + gpuDeviceManager GPUDeviceManager + } + type args struct { + podParentDir string + cs []corev1.ContainerStatus + } + tests := []struct { + name string + fields fields + args args + want []metriccache.GPUMetric + wantErr bool + }{ + { + name: "multiple processes and multiple device", + fields: fields{ + gpuDeviceManager: &gpuDeviceManager{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "12"}, + {Minor: 1, DeviceUUID: "23"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, {SMUtil: 50, MemoryUsed: 1000}}, + 222: {{SMUtil: 10, MemoryUsed: 1000}, {SMUtil: 40, MemoryUsed: 3000}}, + }, + }, + }, + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + cs: []corev1.ContainerStatus{ + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff", + }, + }, + }, + want: []metriccache.GPUMetric{ + {Minor: 0, DeviceUUID: "12", SMUtil: 80, MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI)}, + {Minor: 1, DeviceUUID: "23", SMUtil: 90, MemoryUsed: *resource.NewQuantity(4000, resource.BinarySI)}, + }, + wantErr: false, + }, + { + name: "multiple processes", + fields: fields{ + gpuDeviceManager: &gpuDeviceManager{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "12"}, + {Minor: 1, DeviceUUID: "23"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, nil}, + 222: {nil, {SMUtil: 40, MemoryUsed: 3000}}, + }, + }, + }, + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + cs: []corev1.ContainerStatus{ + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff", + }, + }, + }, + want: []metriccache.GPUMetric{ + {Minor: 0, DeviceUUID: "12", SMUtil: 70, MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI)}, + {Minor: 1, DeviceUUID: "23", SMUtil: 40, MemoryUsed: *resource.NewQuantity(3000, resource.BinarySI)}, + }, + wantErr: false, + }, + + { + name: "single processes", + fields: fields{ + gpuDeviceManager: &gpuDeviceManager{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "12"}, + {Minor: 1, DeviceUUID: "23"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {nil, {SMUtil: 70, MemoryUsed: 1500}}, + 222: {nil, {SMUtil: 20, 
MemoryUsed: 3000}}, + }, + }, + }, + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + cs: []corev1.ContainerStatus{ + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff", + }, + }, + }, + want: []metriccache.GPUMetric{ + {Minor: 1, DeviceUUID: "23", SMUtil: 90, MemoryUsed: *resource.NewQuantity(4500, resource.BinarySI)}, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := tt.fields.gpuDeviceManager + got, err := g.getPodGPUUsage(tt.args.podParentDir, tt.args.cs) + if (err != nil) != tt.wantErr { + t.Errorf("gpuDeviceManager.getPodGPUUsage() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("gpuDeviceManager.getPodGPUUsage() = %v, want %v", got, tt.want) + } + }) + } +} +func Test_gpuDeviceManager_getContainerGPUUsage(t *testing.T) { + + system.SetupCgroupPathFormatter(system.Systemd) + defer system.SetupCgroupPathFormatter(system.Systemd) + dir := t.TempDir() + system.Conf.CgroupRootDir = dir + + p1 := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf.scope/cgroup.procs" + p1CgroupPath := path.Join(dir, p1) + if err := writeCgroupContent(p1CgroupPath, []byte("122\n222")); err != nil { + t.Fatal(err) + } + + p2 := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff.scope/cgroup.procs" + p2CgroupPath := path.Join(dir, p2) + if err := writeCgroupContent(p2CgroupPath, []byte("122")); err != nil { + t.Fatal(err) + } + + type fields struct { + gpuDeviceManager GPUDeviceManager + } + type args struct { + podParentDir string + c *corev1.ContainerStatus + } + tests := []struct { + name string + fields fields + args args + want []metriccache.GPUMetric + wantErr bool + }{ + { + name: "multiple processes and multiple device", + fields: fields{ + gpuDeviceManager: &gpuDeviceManager{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "12"}, + {Minor: 1, DeviceUUID: "23"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, {SMUtil: 50, MemoryUsed: 1000}}, + 222: {{SMUtil: 10, MemoryUsed: 1000}, {SMUtil: 40, MemoryUsed: 3000}}, + }, + }, + }, + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + c: &corev1.ContainerStatus{ + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + }, + want: []metriccache.GPUMetric{ + {Minor: 0, DeviceUUID: "12", SMUtil: 80, MemoryUsed: *resource.NewQuantity(2500, resource.BinarySI)}, + {Minor: 1, DeviceUUID: "23", SMUtil: 90, MemoryUsed: *resource.NewQuantity(4000, resource.BinarySI)}, + }, + wantErr: false, + }, + { + name: "single processes and multiple device", + fields: fields{ + gpuDeviceManager: &gpuDeviceManager{ + deviceCount: 2, + devices: []*device{ + {Minor: 0, DeviceUUID: "12"}, + {Minor: 1, DeviceUUID: "23"}, + }, + processesMetrics: map[uint32][]*rawGPUMetric{ + 122: {{SMUtil: 70, MemoryUsed: 1500}, {SMUtil: 50, MemoryUsed: 1000}}, + 222: {{SMUtil: 10, MemoryUsed: 1000}, {SMUtil: 40, MemoryUsed: 3000}}, + 
}, + }, + }, + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + c: &corev1.ContainerStatus{ + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff", + }, + }, + want: []metriccache.GPUMetric{ + {Minor: 0, DeviceUUID: "12", SMUtil: 70, MemoryUsed: *resource.NewQuantity(1500, resource.BinarySI)}, + {Minor: 1, DeviceUUID: "23", SMUtil: 50, MemoryUsed: *resource.NewQuantity(1000, resource.BinarySI)}, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := tt.fields.gpuDeviceManager + got, err := g.getContainerGPUUsage(tt.args.podParentDir, tt.args.c) + if (err != nil) != tt.wantErr { + t.Errorf("gpuDeviceManager.getContainerGPUUsage() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("gpuDeviceManager.getContainerGPUUsage() = %v, want %v", got, tt.want) + } + }) + } +} + +func writeCgroupContent(filePath string, content []byte) error { + err := os.MkdirAll(path.Dir(filePath), os.ModePerm) + if err != nil { + return err + } + return ioutil.WriteFile(filePath, content, 0655) +} diff --git a/pkg/koordlet/metricsadvisor/config.go b/pkg/koordlet/metricsadvisor/config.go index dd45b4c5b..5fc27022f 100644 --- a/pkg/koordlet/metricsadvisor/config.go +++ b/pkg/koordlet/metricsadvisor/config.go @@ -31,6 +31,6 @@ func NewDefaultConfig() *Config { } func (c *Config) InitFlags(fs *flag.FlagSet) { - fs.IntVar(&c.CollectResUsedIntervalSeconds, "CollectResUsedIntervalSeconds", c.CollectResUsedIntervalSeconds, "Collect node/pod resource usage interval by seconds") - fs.IntVar(&c.CollectNodeCPUInfoIntervalSeconds, "CollectNodeCPUInfoIntervalSeconds", c.CollectNodeCPUInfoIntervalSeconds, "Collect node cpu info interval by seconds") + fs.IntVar(&c.CollectResUsedIntervalSeconds, "collect-res-used-interval-seconds", c.CollectResUsedIntervalSeconds, "Collect node/pod resource usage interval by seconds") + fs.IntVar(&c.CollectNodeCPUInfoIntervalSeconds, "collect-node-cpu-info-interval-seconds", c.CollectNodeCPUInfoIntervalSeconds, "Collect node cpu info interval by seconds") } diff --git a/pkg/koordlet/metricsadvisor/config_test.go b/pkg/koordlet/metricsadvisor/config_test.go index 82592b2f9..998fa260b 100644 --- a/pkg/koordlet/metricsadvisor/config_test.go +++ b/pkg/koordlet/metricsadvisor/config_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package metricsadvisor import ( + "flag" "testing" "github.com/stretchr/testify/assert" @@ -30,3 +31,46 @@ func Test_NewDefaultConfig(t *testing.T) { defaultConfig := NewDefaultConfig() assert.Equal(t, expectConfig, defaultConfig) } + +func Test_InitFlags(t *testing.T) { + cmdArgs := []string{ + "", + "--collect-res-used-interval-seconds=3", + "--collect-node-cpu-info-interval-seconds=90", + } + fs := flag.NewFlagSet(cmdArgs[0], flag.ExitOnError) + + type fields struct { + CollectResUsedIntervalSeconds int + CollectNodeCPUInfoIntervalSeconds int + } + type args struct { + fs *flag.FlagSet + } + tests := []struct { + name string + fields fields + args args + }{ + { + name: "not default", + fields: fields{ + CollectResUsedIntervalSeconds: 3, + CollectNodeCPUInfoIntervalSeconds: 90, + }, + args: args{fs: fs}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + raw := &Config{ + CollectResUsedIntervalSeconds: tt.fields.CollectResUsedIntervalSeconds, + CollectNodeCPUInfoIntervalSeconds: tt.fields.CollectNodeCPUInfoIntervalSeconds, + } + c := NewDefaultConfig() + c.InitFlags(tt.args.fs) + tt.args.fs.Parse(cmdArgs[1:]) + assert.Equal(t, raw, c) + }) + } +} diff --git a/pkg/koordlet/pleg/pleg.go b/pkg/koordlet/pleg/pleg.go index 59265bc24..1cc11a843 100644 --- a/pkg/koordlet/pleg/pleg.go +++ b/pkg/koordlet/pleg/pleg.go @@ -24,6 +24,7 @@ import ( "k8s.io/klog/v2" "github.com/koordinator-sh/koordinator/pkg/util" + "github.com/koordinator-sh/koordinator/pkg/util/system" ) const ( @@ -134,7 +135,9 @@ func (p *pleg) RemoverHandler(id HandlerID) PodLifeCycleHandler { func (p *pleg) Run(stopCh <-chan struct{}) error { qosClasses := []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} for _, qosClass := range qosClasses { - cgroupPath := path.Join(p.cgroupRootPath, util.GetPodQoSRelativePath(qosClass)) + // here we choose cpu subsystem as ground truth, + // since we only need to watch one of all subsystems, and cpu subsystem always and must exist + cgroupPath := path.Join(p.cgroupRootPath, system.CgroupCPUDir, util.GetPodQoSRelativePath(qosClass)) err := p.podWatcher.AddWatch(cgroupPath) if err != nil { klog.Errorf("failed to watch path %v err %v", cgroupPath, err) diff --git a/pkg/koordlet/reporter/config.go b/pkg/koordlet/reporter/config.go index a7245bd07..a5fc0de5c 100644 --- a/pkg/koordlet/reporter/config.go +++ b/pkg/koordlet/reporter/config.go @@ -29,5 +29,5 @@ func NewDefaultConfig() *Config { } func (c *Config) InitFlags(fs *flag.FlagSet) { - fs.IntVar(&c.ReportIntervalSeconds, "ReportIntervalSeconds", c.ReportIntervalSeconds, "Report interval by seconds") + fs.IntVar(&c.ReportIntervalSeconds, "report-interval-seconds", c.ReportIntervalSeconds, "Report interval by seconds") } diff --git a/pkg/koordlet/reporter/config_test.go b/pkg/koordlet/reporter/config_test.go new file mode 100644 index 000000000..72929c752 --- /dev/null +++ b/pkg/koordlet/reporter/config_test.go @@ -0,0 +1,67 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package reporter + +import ( + "flag" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_NewDefaultConfig(t *testing.T) { + expectConfig := &Config{ + ReportIntervalSeconds: 60, + } + defaultConfig := NewDefaultConfig() + assert.Equal(t, expectConfig, defaultConfig) +} + +func Test_InitFlags(t *testing.T) { + cmdArgs := []string{ + "", + "--report-interval-seconds=30", + } + fs := flag.NewFlagSet(cmdArgs[0], flag.ExitOnError) + + type fields struct { + ReportIntervalSeconds int + } + type args struct { + fs *flag.FlagSet + } + tests := []struct { + name string + fields fields + args args + }{ + { + name: "not default", + fields: fields{ReportIntervalSeconds: 30}, + args: args{fs: fs}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + raw := &Config{ReportIntervalSeconds: tt.fields.ReportIntervalSeconds} + c := NewDefaultConfig() + c.InitFlags(tt.args.fs) + tt.args.fs.Parse(cmdArgs[1:]) + assert.Equal(t, raw, c) + }) + } +} diff --git a/pkg/koordlet/reporter/reporter.go b/pkg/koordlet/reporter/reporter.go index 0a6df0837..988a74c46 100644 --- a/pkg/koordlet/reporter/reporter.go +++ b/pkg/koordlet/reporter/reporter.go @@ -211,7 +211,7 @@ func (r *reporter) sync() { if retErr != nil { klog.Warningf("update node metric status failed, status %v, err %v", util.DumpJSON(newStatus), retErr) } else { - klog.Infof("update node metric status success, detail: %v", util.DumpJSON(newStatus)) + klog.V(4).Infof("update node metric status success, detail: %v", util.DumpJSON(newStatus)) } } diff --git a/pkg/koordlet/resmanager/cgroup_reconcile.go b/pkg/koordlet/resmanager/cgroup_reconcile.go index 1e6554665..678125a4c 100644 --- a/pkg/koordlet/resmanager/cgroup_reconcile.go +++ b/pkg/koordlet/resmanager/cgroup_reconcile.go @@ -69,9 +69,9 @@ func (m *CgroupResourcesReconcile) RunInit(stopCh <-chan struct{}) error { func (m *CgroupResourcesReconcile) reconcile() { nodeSLO := m.resmanager.getNodeSLOCopy() - if nodeSLO == nil || nodeSLO.Spec.ResourceQoSStrategy == nil { - // do nothing if nodeSLO == nil || nodeSLO.Spec.ResourceQoSStrategy == nil - klog.Warning("nodeSLO or nodeSLO.Spec.ResourceQoSStrategy is nil %v", util.DumpJSON(nodeSLO)) + if nodeSLO == nil || nodeSLO.Spec.ResourceQOSStrategy == nil { + // do nothing if nodeSLO == nil || nodeSLO.Spec.ResourceQOSStrategy == nil + klog.Warning("nodeSLO or nodeSLO.Spec.ResourceQOSStrategy is nil %v", util.DumpJSON(nodeSLO)) return } @@ -99,7 +99,7 @@ func (m *CgroupResourcesReconcile) calculateAndUpdateResources(nodeSLO *slov1alp podMetas := m.resmanager.statesInformer.GetAllPods() // calculate qos-level, pod-level and container-level resources - qosResources, podResources, containerResources := m.calculateResources(nodeSLO.Spec.ResourceQoSStrategy, node, podMetas) + qosResources, podResources, containerResources := m.calculateResources(nodeSLO.Spec.ResourceQOSStrategy, node, podMetas) // to make sure the hierarchical cgroup resources are correctly updated, we simply update the resources by // cgroup-level order. 
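A small illustrative computation of why the cgroup-level update order above matters, using hypothetical numbers (a single LS pod requesting 1 GiB with MinLimitPercent=100, mirroring the per-pod formula in calculatePodResources). The qos-level memory.min sums the per-pod values and is written first, so a pod's protection takes effect as soon as it is written because the parent value is already in place.

	// Hypothetical values, not taken from this patch.
	minLimitPercent := int64(100)
	podMemRequest := int64(1 << 30) // request of one LS pod

	podMemoryMin := podMemRequest * minLimitPercent / 100 // pod-level memory.min
	qosMemoryMin := podMemoryMin                          // qos-level memory.min: sum over all pods in the class
	_ = qosMemoryMin
	// Updates are then applied qosResources -> podResources -> containerResources,
	// i.e. from the outer cgroup directory inwards.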
@@ -112,7 +112,7 @@ func (m *CgroupResourcesReconcile) calculateAndUpdateResources(nodeSLO *slov1alp } // calculateResources calculates qos-level, pod-level and container-level resources with nodeCfg and podMetas -func (m *CgroupResourcesReconcile) calculateResources(nodeCfg *slov1alpha1.ResourceQoSStrategy, node *corev1.Node, +func (m *CgroupResourcesReconcile) calculateResources(nodeCfg *slov1alpha1.ResourceQOSStrategy, node *corev1.Node, podMetas []*statesinformer.PodMeta) (qosLevelResources, podLevelResources, containerLevelResources []MergeableResourceUpdater) { // TODO: check anolis os version qosSummary := map[corev1.PodQOSClass]*cgroupResourceSummary{ @@ -162,7 +162,7 @@ func (m *CgroupResourcesReconcile) calculateResources(nodeCfg *slov1alpha1.Resou } func (m *CgroupResourcesReconcile) calculateQoSResources(summary *cgroupResourceSummary, qos corev1.PodQOSClass, - qosCfg *slov1alpha1.ResourceQoS) []MergeableResourceUpdater { + qosCfg *slov1alpha1.ResourceQOS) []MergeableResourceUpdater { // double-check qosCfg is not nil if qosCfg == nil { klog.Warningf("calculateQoSResources aborts since qos config is %v", qosCfg) @@ -172,17 +172,17 @@ func (m *CgroupResourcesReconcile) calculateQoSResources(summary *cgroupResource qosDir := util.GetKubeQosRelativePath(qos) // Mem QoS - if qosCfg.MemoryQoS != nil { - summary.memoryUsePriorityOom = qosCfg.MemoryQoS.PriorityEnable - summary.memoryPriority = qosCfg.MemoryQoS.Priority - summary.memoryOomKillGroup = qosCfg.MemoryQoS.OomKillGroup + if qosCfg.MemoryQOS != nil { + summary.memoryUsePriorityOom = qosCfg.MemoryQOS.PriorityEnable + summary.memoryPriority = qosCfg.MemoryQOS.Priority + summary.memoryOomKillGroup = qosCfg.MemoryQOS.OomKillGroup } return makeCgroupResources(GroupOwnerRef(string(qos)), qosDir, summary) } func (m *CgroupResourcesReconcile) calculatePodAndContainerResources(podMeta *statesinformer.PodMeta, node *corev1.Node, - podCfg *slov1alpha1.ResourceQoS) (podResources, containerResources []MergeableResourceUpdater) { + podCfg *slov1alpha1.ResourceQOS) (podResources, containerResources []MergeableResourceUpdater) { pod := podMeta.Pod podDir := util.GetPodCgroupDirWithKube(podMeta.CgroupDir) @@ -208,7 +208,7 @@ func (m *CgroupResourcesReconcile) calculatePodAndContainerResources(podMeta *st return } -func (m *CgroupResourcesReconcile) calculatePodResources(pod *corev1.Pod, parentDir string, podCfg *slov1alpha1.ResourceQoS) []MergeableResourceUpdater { +func (m *CgroupResourcesReconcile) calculatePodResources(pod *corev1.Pod, parentDir string, podCfg *slov1alpha1.ResourceQOS) []MergeableResourceUpdater { // double-check qos config is not nil if podCfg == nil { klog.V(5).Infof("calculatePodResources aborts since pod-level config is empty, cfg: %v", podCfg) @@ -218,13 +218,13 @@ func (m *CgroupResourcesReconcile) calculatePodResources(pod *corev1.Pod, parent // Mem QoS // resources statically use configured values - if podCfg.MemoryQoS != nil { - summary.memoryWmarkRatio = podCfg.MemoryQoS.WmarkRatio - summary.memoryWmarkScaleFactor = podCfg.MemoryQoS.WmarkScalePermill - summary.memoryWmarkMinAdj = podCfg.MemoryQoS.WmarkMinAdj - summary.memoryUsePriorityOom = podCfg.MemoryQoS.PriorityEnable - summary.memoryPriority = podCfg.MemoryQoS.Priority - summary.memoryOomKillGroup = podCfg.MemoryQoS.OomKillGroup + if podCfg.MemoryQOS != nil { + summary.memoryWmarkRatio = podCfg.MemoryQOS.WmarkRatio + summary.memoryWmarkScaleFactor = podCfg.MemoryQOS.WmarkScalePermill + summary.memoryWmarkMinAdj = podCfg.MemoryQOS.WmarkMinAdj + 
summary.memoryUsePriorityOom = podCfg.MemoryQOS.PriorityEnable + summary.memoryPriority = podCfg.MemoryQOS.Priority + summary.memoryOomKillGroup = podCfg.MemoryQOS.OomKillGroup // resources calculated with pod spec var memRequest int64 // memory.min, memory.low: just sum all containers' memory requests; regard as no memory protection when any @@ -235,12 +235,12 @@ func (m *CgroupResourcesReconcile) calculatePodResources(pod *corev1.Pod, parent } else { memRequest = util.GetPodBEMemoryByteRequestIgnoreUnlimited(pod) } - if podCfg.MemoryQoS.MinLimitPercent != nil { + if podCfg.MemoryQOS.MinLimitPercent != nil { // assert no overflow for request < 1PiB - summary.memoryMin = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQoS.MinLimitPercent) / 100) + summary.memoryMin = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQOS.MinLimitPercent) / 100) } - if podCfg.MemoryQoS.LowLimitPercent != nil { - summary.memoryLow = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQoS.LowLimitPercent) / 100) + if podCfg.MemoryQOS.LowLimitPercent != nil { + summary.memoryLow = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQOS.LowLimitPercent) / 100) } // values improved: memory.low is no less than memory.min if summary.memoryMin != nil && summary.memoryLow != nil && *summary.memoryLow > 0 && @@ -255,7 +255,7 @@ func (m *CgroupResourcesReconcile) calculatePodResources(pod *corev1.Pod, parent } func (m *CgroupResourcesReconcile) calculateContainerResources(container *corev1.Container, pod *corev1.Pod, - node *corev1.Node, parentDir string, podCfg *slov1alpha1.ResourceQoS) []MergeableResourceUpdater { + node *corev1.Node, parentDir string, podCfg *slov1alpha1.ResourceQOS) []MergeableResourceUpdater { // double-check qos config is not nil if podCfg == nil { klog.V(5).Infof("calculateContainerResources aborts since pod-level config is empty, cfg: %v", podCfg) @@ -265,13 +265,13 @@ func (m *CgroupResourcesReconcile) calculateContainerResources(container *corev1 // Mem QoS // resources statically use configured values - if podCfg.MemoryQoS != nil { - summary.memoryWmarkRatio = podCfg.MemoryQoS.WmarkRatio - summary.memoryWmarkScaleFactor = podCfg.MemoryQoS.WmarkScalePermill - summary.memoryWmarkMinAdj = podCfg.MemoryQoS.WmarkMinAdj - summary.memoryUsePriorityOom = podCfg.MemoryQoS.PriorityEnable - summary.memoryPriority = podCfg.MemoryQoS.Priority - summary.memoryOomKillGroup = podCfg.MemoryQoS.OomKillGroup + if podCfg.MemoryQOS != nil { + summary.memoryWmarkRatio = podCfg.MemoryQOS.WmarkRatio + summary.memoryWmarkScaleFactor = podCfg.MemoryQOS.WmarkScalePermill + summary.memoryWmarkMinAdj = podCfg.MemoryQOS.WmarkMinAdj + summary.memoryUsePriorityOom = podCfg.MemoryQOS.PriorityEnable + summary.memoryPriority = podCfg.MemoryQOS.Priority + summary.memoryOomKillGroup = podCfg.MemoryQOS.OomKillGroup // resources calculated with container spec var memRequest int64 var memLimit int64 @@ -287,22 +287,22 @@ func (m *CgroupResourcesReconcile) calculateContainerResources(container *corev1 memRequest = 0 } // memory.min, memory.low: if container's memory request is not set, just consider it as zero - if podCfg.MemoryQoS.MinLimitPercent != nil { - summary.memoryMin = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQoS.MinLimitPercent) / 100) + if podCfg.MemoryQOS.MinLimitPercent != nil { + summary.memoryMin = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQOS.MinLimitPercent) / 100) } - if podCfg.MemoryQoS.LowLimitPercent != nil { - summary.memoryLow = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQoS.LowLimitPercent) / 100) + if 
podCfg.MemoryQOS.LowLimitPercent != nil { + summary.memoryLow = pointer.Int64Ptr(memRequest * (*podCfg.MemoryQOS.LowLimitPercent) / 100) } // memory.high: if container's memory throttling factor is set as zero, disable memory.high by set to maximal; // else if factor is set while container's limit not set, set memory.high with node memory allocatable - if podCfg.MemoryQoS.ThrottlingPercent != nil { - if *podCfg.MemoryQoS.ThrottlingPercent == 0 { // reset to system default if set 0 + if podCfg.MemoryQOS.ThrottlingPercent != nil { + if *podCfg.MemoryQOS.ThrottlingPercent == 0 { // reset to system default if set 0 summary.memoryHigh = pointer.Int64Ptr(math.MaxInt64) // writing MaxInt64 is equal to write "max" } else if memLimit > 0 { - summary.memoryHigh = pointer.Int64Ptr(memLimit * (*podCfg.MemoryQoS.ThrottlingPercent) / 100) + summary.memoryHigh = pointer.Int64Ptr(memLimit * (*podCfg.MemoryQOS.ThrottlingPercent) / 100) } else { nodeLimit := node.Status.Allocatable.Memory().Value() - summary.memoryHigh = pointer.Int64Ptr(nodeLimit * (*podCfg.MemoryQoS.ThrottlingPercent) / 100) + summary.memoryHigh = pointer.Int64Ptr(nodeLimit * (*podCfg.MemoryQOS.ThrottlingPercent) / 100) } } // values improved: memory.low is no less than memory.min @@ -324,29 +324,29 @@ func (m *CgroupResourcesReconcile) calculateContainerResources(container *corev1 return makeCgroupResources(ContainerOwnerRef(pod.Namespace, pod.Name, container.Name), parentDir, summary) } -// getMergedPodResourceQoS returns a merged ResourceQoS for the pod (i.e. a pod-level qos config). +// getMergedPodResourceQoS returns a merged ResourceQOS for the pod (i.e. a pod-level qos config). // 1. merge pod-level cfg with node-level cfg if pod annotation of advanced qos config exists; // 2. calculates and finally returns the pod-level cfg with each feature cfg (e.g. pod-level memory qos config). 
-func (m *CgroupResourcesReconcile) getMergedPodResourceQoS(pod *corev1.Pod, cfg *slov1alpha1.ResourceQoS) (*slov1alpha1.ResourceQoS, error) { - // deep-copy node config into pod config; assert cfg == NoneResourceQoS when node disables +func (m *CgroupResourcesReconcile) getMergedPodResourceQoS(pod *corev1.Pod, cfg *slov1alpha1.ResourceQOS) (*slov1alpha1.ResourceQOS, error) { + // deep-copy node config into pod config; assert cfg == NoneResourceQOS when node disables mergedCfg := cfg.DeepCopy() // update with memory qos config m.mergePodResourceQoSForMemoryQoS(pod, mergedCfg) - klog.V(5).Infof("get merged pod ResourceQoS %v for pod %s", util.DumpJSON(mergedCfg), util.GetPodKey(pod)) + klog.V(5).Infof("get merged pod ResourceQOS %v for pod %s", util.DumpJSON(mergedCfg), util.GetPodKey(pod)) return mergedCfg, nil } // mergePodResourceQoSForMemoryQoS merges pod-level memory qos config with node-level resource qos config // config overwrite: pod-level config > pod policy template > node-level config -func (m *CgroupResourcesReconcile) mergePodResourceQoSForMemoryQoS(pod *corev1.Pod, cfg *slov1alpha1.ResourceQoS) { +func (m *CgroupResourcesReconcile) mergePodResourceQoSForMemoryQoS(pod *corev1.Pod, cfg *slov1alpha1.ResourceQOS) { // get the pod-level config and determine if the pod is allowed // TODO: support namespaced switch - if cfg.MemoryQoS == nil { - cfg.MemoryQoS = &slov1alpha1.MemoryQoSCfg{} + if cfg.MemoryQOS == nil { + cfg.MemoryQOS = &slov1alpha1.MemoryQOSCfg{} } - policy := slov1alpha1.PodMemoryQoSPolicyDefault + policy := slov1alpha1.PodMemoryQOSPolicyDefault // get pod-level config podCfg, err := apiext.GetPodMemoryQoSConfig(pod) @@ -360,12 +360,12 @@ func (m *CgroupResourcesReconcile) mergePodResourceQoSForMemoryQoS(pod *corev1.P klog.V(5).Infof("memory qos podPolicy=%s for pod %s", policy, util.GetPodKey(pod)) // if policy is not default, replace memory qos config with the policy template - if policy == slov1alpha1.PodMemoryQoSPolicyNone { // fully disable memory qos for policy=None - cfg.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() - cfg.MemoryQoS.Enable = pointer.BoolPtr(false) + if policy == slov1alpha1.PodMemoryQOSPolicyNone { // fully disable memory qos for policy=None + cfg.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() + cfg.MemoryQOS.Enable = pointer.BoolPtr(false) return - } else if policy == slov1alpha1.PodMemoryQoSPolicyAuto { // qos=None would be set with kubeQoS for policy=Auto - cfg.MemoryQoS.MemoryQoS = getPodResourceQoSByQoSClass(pod, util.DefaultResourceQoSStrategy(), m.resmanager.config).MemoryQoS.MemoryQoS + } else if policy == slov1alpha1.PodMemoryQOSPolicyAuto { // qos=None would be set with kubeQoS for policy=Auto + cfg.MemoryQOS.MemoryQOS = getPodResourceQoSByQoSClass(pod, util.DefaultResourceQOSStrategy(), m.resmanager.config).MemoryQOS.MemoryQOS } // no need to merge config if pod-level config is nil @@ -373,18 +373,18 @@ func (m *CgroupResourcesReconcile) mergePodResourceQoSForMemoryQoS(pod *corev1.P return } // otherwise detailed pod-level config is specified, merge with node-level config for the pod - merged, err := util.MergeCfg(&cfg.MemoryQoS.MemoryQoS, &podCfg.MemoryQoS) // node config has been deep-copied + merged, err := util.MergeCfg(&cfg.MemoryQOS.MemoryQOS, &podCfg.MemoryQOS) // node config has been deep-copied if err != nil { // not change memory qos config if merge error klog.Errorf("failed to merge memory qos config with node config, pod %s, err: %s", util.GetPodKey(pod), err) return } - cfg.MemoryQoS.MemoryQoS = 
*merged.(*slov1alpha1.MemoryQoS) - klog.V(6).Infof("get merged memory qos %v", util.DumpJSON(cfg.MemoryQoS)) + cfg.MemoryQOS.MemoryQOS = *merged.(*slov1alpha1.MemoryQOS) + klog.V(6).Infof("get merged memory qos %v", util.DumpJSON(cfg.MemoryQOS)) } // updateCgroupSummaryForQoS updates qos cgroup summary by pod to summarize qos-level cgroup according to belonging pods -func updateCgroupSummaryForQoS(summary *cgroupResourceSummary, pod *corev1.Pod, podCfg *slov1alpha1.ResourceQoS) { +func updateCgroupSummaryForQoS(summary *cgroupResourceSummary, pod *corev1.Pod, podCfg *slov1alpha1.ResourceQOS) { // Memory QoS // `memory.min` for qos := sum(requests of pod with the qos * minLimitPercent); if factor is nil, set kernel default // `memory.low` for qos := sum(requests of pod with the qos * lowLimitPercent); if factor is nil, set kernel default @@ -396,18 +396,18 @@ func updateCgroupSummaryForQoS(summary *cgroupResourceSummary, pod *corev1.Pod, } else { memRequest = util.GetPodBEMemoryByteRequestIgnoreUnlimited(pod) } - if podCfg.MemoryQoS.MinLimitPercent != nil { + if podCfg.MemoryQOS.MinLimitPercent != nil { if summary.memoryMin == nil { summary.memoryMin = pointer.Int64Ptr(0) } // assert no overflow for req < 1PiB - *summary.memoryMin += memRequest * (*podCfg.MemoryQoS.MinLimitPercent) / 100 + *summary.memoryMin += memRequest * (*podCfg.MemoryQOS.MinLimitPercent) / 100 } - if podCfg.MemoryQoS.LowLimitPercent != nil { + if podCfg.MemoryQOS.LowLimitPercent != nil { if summary.memoryLow == nil { summary.memoryLow = pointer.Int64Ptr(0) } - *summary.memoryLow += memRequest * (*podCfg.MemoryQoS.LowLimitPercent) / 100 + *summary.memoryLow += memRequest * (*podCfg.MemoryQOS.LowLimitPercent) / 100 } } @@ -467,6 +467,8 @@ func makeCgroupResourcesForAnolis(owner *OwnerRef, parentDir string, summary *cg var resources []MergeableResourceUpdater if !system.HostSystemInfo.IsAnolisOS { + klog.V(5).Infof("ignored cgroup resources which require Anolis OS since host is not Anolis OS, owner: %v, parentDir: %v", + owner, parentDir) return nil } @@ -517,42 +519,42 @@ func makeCgroupResourcesForAnolis(owner *OwnerRef, parentDir string, summary *cg // getKubeQoSResourceQoSByQoSClass gets pod config by mapping kube qos into koordinator qos. // https://koordinator.sh/docs/core-concepts/qos/#koordinator-qos-vs-kubernetes-qos -func getKubeQoSResourceQoSByQoSClass(qosClass corev1.PodQOSClass, strategy *slov1alpha1.ResourceQoSStrategy, - config *Config) *slov1alpha1.ResourceQoS { +func getKubeQoSResourceQoSByQoSClass(qosClass corev1.PodQOSClass, strategy *slov1alpha1.ResourceQOSStrategy, + config *Config) *slov1alpha1.ResourceQOS { // NOTE: only used for static qos resource calculation here, and it may be incorrect mapping for dynamic qos // resource, e.g.
qos class of a LS pod can be corev1.PodQOSGuaranteed if strategy == nil { return nil } - var resourceQoS *slov1alpha1.ResourceQoS + var resourceQoS *slov1alpha1.ResourceQOS switch qosClass { case corev1.PodQOSGuaranteed: - resourceQoS = strategy.LSR + resourceQoS = strategy.LSRClass case corev1.PodQOSBurstable: - resourceQoS = strategy.LS + resourceQoS = strategy.LSClass case corev1.PodQOSBestEffort: - resourceQoS = strategy.BE + resourceQoS = strategy.BEClass } return resourceQoS } -func getPodResourceQoSByQoSClass(pod *corev1.Pod, strategy *slov1alpha1.ResourceQoSStrategy, config *Config) *slov1alpha1.ResourceQoS { +func getPodResourceQoSByQoSClass(pod *corev1.Pod, strategy *slov1alpha1.ResourceQOSStrategy, config *Config) *slov1alpha1.ResourceQOS { if strategy == nil { return nil } - var resourceQoS *slov1alpha1.ResourceQoS + var resourceQoS *slov1alpha1.ResourceQOS podQoS := apiext.GetPodQoSClass(pod) switch podQoS { case apiext.QoSLSR: - resourceQoS = strategy.LSR + resourceQoS = strategy.LSRClass case apiext.QoSLS: - resourceQoS = strategy.LS + resourceQoS = strategy.LSClass case apiext.QoSBE: - resourceQoS = strategy.BE + resourceQoS = strategy.BEClass default: // qos=None pods uses config mapped from kubeQoS resourceQoS = getKubeQoSResourceQoSByQoSClass(util.GetKubeQosClass(pod), strategy, config) - klog.V(6).Infof("get pod ResourceQoS according to kubeQoS for QoS=None pods, pod %s, "+ + klog.V(6).Infof("get pod ResourceQOS according to kubeQoS for QoS=None pods, pod %s, "+ "resourceQoS %v", util.GetPodKey(pod), util.DumpJSON(resourceQoS)) } return resourceQoS diff --git a/pkg/koordlet/resmanager/cgroup_reconcile_test.go b/pkg/koordlet/resmanager/cgroup_reconcile_test.go index 0fdc35620..d27d43e10 100644 --- a/pkg/koordlet/resmanager/cgroup_reconcile_test.go +++ b/pkg/koordlet/resmanager/cgroup_reconcile_test.go @@ -55,11 +55,11 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, } - testingQoSStrategyBE := &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + testingQOSStrategyBE := &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -72,10 +72,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -88,10 +88,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -105,11 +105,11 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, } - testingQoSStrategyLS := &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + 
testingQOSStrategyLS := &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(100), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -122,10 +122,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(100), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -138,10 +138,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -155,11 +155,11 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, } - testingQoSStrategyLSR := &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + testingQOSStrategyLSR := &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(100), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -172,10 +172,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -188,10 +188,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -205,11 +205,11 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, } - testingQoSStrategyNone := &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + testingQOSStrategyNone := &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -222,10 +222,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: 
&slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -238,10 +238,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -259,46 +259,46 @@ func Test_calculateAndUpdateResources(t *testing.T) { testingNonRunningPod.Pod.Status.Phase = corev1.PodSucceeded type args struct { name string - qosStrategy *slov1alpha1.ResourceQoSStrategy + qosStrategy *slov1alpha1.ResourceQOSStrategy podMetas []*statesinformer.PodMeta - expect *slov1alpha1.ResourceQoSStrategy + expect *slov1alpha1.ResourceQOSStrategy } tests := []args{ { name: "empty config with no pod", - qosStrategy: defaultQoSStrategy(), - expect: defaultQoSStrategy(), + qosStrategy: defaultQOSStrategy(), + expect: defaultQOSStrategy(), }, { name: "valid config with no pod", - qosStrategy: newValidQoSStrategy(), - expect: mergeWithDefaultQoSStrategy(newValidQoSStrategy()), // memory.wmark_xxx use default + qosStrategy: newValidQOSStrategy(), + expect: mergeWithDefaultQOSStrategy(newValidQOSStrategy()), // memory.wmark_xxx use default }, { name: "mixed config with no pod", - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ WmarkRatio: pointer.Int64Ptr(101), WmarkMinAdj: pointer.Int64Ptr(-51), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ Priority: pointer.Int64Ptr(6), PriorityEnable: pointer.Int64Ptr(1), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ WmarkRatio: pointer.Int64Ptr(-1), WmarkScalePermill: pointer.Int64Ptr(20), WmarkMinAdj: pointer.Int64Ptr(53), @@ -308,11 +308,11 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - expect: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + expect: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -325,10 +325,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - 
MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -341,10 +341,10 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -361,51 +361,51 @@ func Test_calculateAndUpdateResources(t *testing.T) { }, { name: "calculate qos resources from a pod", - qosStrategy: testingQoSStrategyBE, + qosStrategy: testingQOSStrategyBE, podMetas: []*statesinformer.PodMeta{ createPod(corev1.PodQOSBestEffort, apiext.QoSBE), }, - expect: mergeWithDefaultQoSStrategy(testingQoSStrategyBE), + expect: mergeWithDefaultQOSStrategy(testingQOSStrategyBE), }, { name: "calculate qos resources from a pod 1", - qosStrategy: testingQoSStrategyLS, + qosStrategy: testingQOSStrategyLS, podMetas: []*statesinformer.PodMeta{ createPod(corev1.PodQOSBurstable, apiext.QoSLS), }, - expect: mergeWithDefaultQoSStrategy(testingQoSStrategyLS), + expect: mergeWithDefaultQOSStrategy(testingQOSStrategyLS), }, { name: "calculate qos resources from a pod 2", - qosStrategy: testingQoSStrategyLSR, + qosStrategy: testingQOSStrategyLSR, podMetas: []*statesinformer.PodMeta{ createPod(corev1.PodQOSGuaranteed, apiext.QoSLSR), }, - expect: mergeWithDefaultQoSStrategy(testingQoSStrategyLSR), + expect: mergeWithDefaultQOSStrategy(testingQOSStrategyLSR), }, { name: "node disabled", - qosStrategy: testingQoSStrategyNone, + qosStrategy: testingQOSStrategyNone, podMetas: []*statesinformer.PodMeta{ - createPodWithMemoryQoS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQoSConfig{Policy: slov1alpha1.PodMemoryQoSPolicyDefault}), + createPodWithMemoryQOS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQOSConfig{Policy: slov1alpha1.PodMemoryQOSPolicyDefault}), }, - expect: mergeWithDefaultQoSStrategy(testingQoSStrategyNone), + expect: mergeWithDefaultQOSStrategy(testingQOSStrategyNone), }, { name: "pod enabled while node disabled", - qosStrategy: testingQoSStrategyBE, + qosStrategy: testingQOSStrategyBE, podMetas: []*statesinformer.PodMeta{ - createPodWithMemoryQoS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQoSConfig{Policy: slov1alpha1.PodMemoryQoSPolicyAuto}), + createPodWithMemoryQOS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQOSConfig{Policy: slov1alpha1.PodMemoryQOSPolicyAuto}), }, - expect: mergeWithDefaultQoSStrategy(testingQoSStrategyBE), + expect: mergeWithDefaultQOSStrategy(testingQOSStrategyBE), }, { name: "ignore non-running pod", - qosStrategy: testingQoSStrategyBE, + qosStrategy: testingQOSStrategyBE, podMetas: []*statesinformer.PodMeta{ testingNonRunningPod, }, - expect: defaultQoSStrategy(), + expect: defaultQOSStrategy(), }, } for _, tt := range tests { @@ -426,11 +426,11 @@ func Test_calculateAndUpdateResources(t *testing.T) { helper := system.NewFileTestUtil(t) - initQoSStrategy := defaultQoSStrategy() - initQoSCgroupFile(initQoSStrategy, helper) + initQOSStrategy := defaultQOSStrategy() + initQOSCgroupFile(initQOSStrategy, helper) - reconciler.calculateAndUpdateResources(createNodeSLOWithQoSStrategy(tt.qosStrategy)) - got := gotQoSStrategyFromFile() + 
reconciler.calculateAndUpdateResources(createNodeSLOWithQOSStrategy(tt.qosStrategy)) + got := gotQOSStrategyFromFile() assert.Equal(t, tt.expect, got) }) } @@ -441,9 +441,9 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { podParentDirLS := util.GetPodCgroupDirWithKube(testingPodLS.CgroupDir) containerDirLS, _ := util.GetContainerCgroupPathWithKube(testingPodLS.CgroupDir, &testingPodLS.Pod.Status.ContainerStatuses[0]) containerDirLS1, _ := util.GetContainerCgroupPathWithKube(testingPodLS.CgroupDir, &testingPodLS.Pod.Status.ContainerStatuses[1]) - testingPodBEWithMemQoS := createPodWithMemoryQoS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQoSConfig{ - Policy: slov1alpha1.PodMemoryQoSPolicyAuto, - MemoryQoS: slov1alpha1.MemoryQoS{ + testingPodBEWithMemQOS := createPodWithMemoryQOS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQOSConfig{ + Policy: slov1alpha1.PodMemoryQOSPolicyAuto, + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(100), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(80), @@ -452,9 +452,9 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { WmarkMinAdj: pointer.Int64Ptr(50), }, }) - testingPodBEWithMemQoS1 := createPodWithMemoryQoS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQoSConfig{ - Policy: slov1alpha1.PodMemoryQoSPolicyAuto, - MemoryQoS: slov1alpha1.MemoryQoS{ + testingPodBEWithMemQoS1 := createPodWithMemoryQOS(corev1.PodQOSBestEffort, apiext.QoSBE, &slov1alpha1.PodMemoryQOSConfig{ + Policy: slov1alpha1.PodMemoryQOSPolicyAuto, + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(50), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(40), @@ -463,14 +463,14 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { WmarkMinAdj: pointer.Int64Ptr(50), }, }) - podParentDirBE := util.GetPodCgroupDirWithKube(testingPodBEWithMemQoS.CgroupDir) - containerDirBE, _ := util.GetContainerCgroupPathWithKube(testingPodBEWithMemQoS.CgroupDir, &testingPodBEWithMemQoS.Pod.Status.ContainerStatuses[0]) - containerDirBE1, _ := util.GetContainerCgroupPathWithKube(testingPodBEWithMemQoS.CgroupDir, &testingPodBEWithMemQoS.Pod.Status.ContainerStatuses[1]) + podParentDirBE := util.GetPodCgroupDirWithKube(testingPodBEWithMemQOS.CgroupDir) + containerDirBE, _ := util.GetContainerCgroupPathWithKube(testingPodBEWithMemQOS.CgroupDir, &testingPodBEWithMemQOS.Pod.Status.ContainerStatuses[0]) + containerDirBE1, _ := util.GetContainerCgroupPathWithKube(testingPodBEWithMemQOS.CgroupDir, &testingPodBEWithMemQOS.Pod.Status.ContainerStatuses[1]) type fields struct { resmanager *resmanager } type args struct { - nodeCfg *slov1alpha1.ResourceQoSStrategy + nodeCfg *slov1alpha1.ResourceQOSStrategy node *corev1.Node podMetas []*statesinformer.PodMeta } @@ -491,10 +491,10 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { name: "not panic when no pods exists with a valid resourceQoS config", fields: fields{resmanager: &resmanager{config: NewDefaultConfig()}}, args: args{ - nodeCfg: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{}, - LS: &slov1alpha1.ResourceQoS{}, - BE: &slov1alpha1.ResourceQoS{}, + nodeCfg: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{}, + LSClass: &slov1alpha1.ResourceQOS{}, + BEClass: &slov1alpha1.ResourceQOS{}, }, }, // no resourceUpdater generated @@ -503,10 +503,10 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { name: 
"config is empty", fields: fields{resmanager: &resmanager{config: NewDefaultConfig()}}, args: args{ - nodeCfg: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{}, - LS: &slov1alpha1.ResourceQoS{}, - BE: &slov1alpha1.ResourceQoS{}, + nodeCfg: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{}, + LSClass: &slov1alpha1.ResourceQOS{}, + BEClass: &slov1alpha1.ResourceQOS{}, }, podMetas: []*statesinformer.PodMeta{ { @@ -525,10 +525,10 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { name: "single pod using node-level config", fields: fields{resmanager: &resmanager{config: NewDefaultConfig()}}, args: args{ - nodeCfg: &slov1alpha1.ResourceQoSStrategy{ - LSR: defaultQoSStrategy().LSR, - LS: defaultQoSStrategy().LS, - BE: defaultQoSStrategy().BE, + nodeCfg: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: defaultQOSStrategy().LSRClass, + LSClass: defaultQOSStrategy().LSClass, + BEClass: defaultQOSStrategy().BEClass, }, podMetas: []*statesinformer.PodMeta{ testingPodLS, @@ -584,10 +584,10 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { name: "single pod using pod-level config", fields: fields{resmanager: &resmanager{config: NewDefaultConfig()}}, args: args{ - nodeCfg: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{}, - LS: &slov1alpha1.ResourceQoS{}, - BE: &slov1alpha1.ResourceQoS{}, + nodeCfg: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{}, + LSClass: &slov1alpha1.ResourceQOS{}, + BEClass: &slov1alpha1.ResourceQOS{}, }, node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -601,7 +601,7 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { }, }, podMetas: []*statesinformer.PodMeta{ - testingPodBEWithMemQoS, + testingPodBEWithMemQOS, }, }, want: []MergeableResourceUpdater{ @@ -611,46 +611,46 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { NewMergeableCgroupResourceUpdater(GroupOwnerRef(string(corev1.PodQOSBestEffort)), util.GetKubeQosRelativePath(corev1.PodQOSBestEffort), system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), }, want1: []MergeableResourceUpdater{ - NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemOomGroup, "0"), + 
NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemOomGroup, "0"), }, want2: []MergeableResourceUpdater{ - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*80/100, 10), mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemOomGroup, "0"), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), 
containerDirBE1, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*80/100, 10), mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemLow, "0", 
mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemOomGroup, "0"), }, }, { name: "multiple pods", fields: fields{resmanager: &resmanager{config: NewDefaultConfig()}}, args: args{ - nodeCfg: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{}, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + nodeCfg: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{}, + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(false), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -660,7 +660,7 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { }, }, }, - BE: &slov1alpha1.ResourceQoS{}, + BEClass: &slov1alpha1.ResourceQOS{}, }, node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -675,7 +675,7 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { }, podMetas: []*statesinformer.PodMeta{ testingPodLS, - testingPodBEWithMemQoS, + testingPodBEWithMemQOS, }, }, want: []MergeableResourceUpdater{ @@ -692,14 +692,14 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name), podParentDirLS, system.MemWmarkRatio, "0"), NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name), podParentDirLS, system.MemWmarkScaleFactor, "50"), NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name), podParentDirLS, system.MemWmarkMinAdj, "0"), - NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkRatio, 
"95"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemOomGroup, "0"), }, want2: []MergeableResourceUpdater{ NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name, "test"), containerDirLS, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), @@ -714,34 +714,34 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name, "main"), containerDirLS1, system.MemWmarkRatio, "0"), NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name, "main"), containerDirLS1, system.MemWmarkScaleFactor, "50"), NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodLS.Pod.Namespace, testingPodLS.Pod.Name, "main"), containerDirLS1, system.MemWmarkMinAdj, "0"), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, 
system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*80/100, 10), mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemOomGroup, "0"), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemHigh, 
strconv.FormatInt(testingPodMemRequestLimitBytes*80/100, 10), mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes, 10), mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemOomGroup, "0"), }, }, { name: "single pod with memory.high is no less than memory.min", fields: fields{resmanager: &resmanager{config: NewDefaultConfig()}}, args: args{ - nodeCfg: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{}, - LS: &slov1alpha1.ResourceQoS{}, - BE: &slov1alpha1.ResourceQoS{}, + nodeCfg: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{}, + LSClass: &slov1alpha1.ResourceQOS{}, + BEClass: &slov1alpha1.ResourceQOS{}, }, node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -765,34 +765,34 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { 
NewMergeableCgroupResourceUpdater(GroupOwnerRef(string(corev1.PodQOSBestEffort)), util.GetKubeQosRelativePath(corev1.PodQOSBestEffort), system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), }, want1: []MergeableResourceUpdater{ - NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes*50/100, 10), mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name), podParentDirBE, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes*50/100, 10), mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(PodOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name), podParentDirBE, system.MemOomGroup, "0"), }, want2: []MergeableResourceUpdater{ - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - 
NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*40/100, 10), mergeFuncUpdateCgroupIfLarger), // node allocatable * throttling factor - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "test"), containerDirBE, system.MemOomGroup, "0"), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes*50/100, 10), mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), - NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*50/100, 10), mergeFuncUpdateCgroupIfLarger), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkRatio, "95"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkScaleFactor, "20"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemWmarkMinAdj, "50"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemPriority, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemUsePriorityOom, "0"), - NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQoS.Pod.Namespace, testingPodBEWithMemQoS.Pod.Name, "main"), containerDirBE1, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemMin, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + 
NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*40/100, 10), mergeFuncUpdateCgroupIfLarger), // node allocatable * throttling factor + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "test"), containerDirBE, system.MemOomGroup, "0"), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemMin, strconv.FormatInt(testingPodMemRequestLimitBytes*50/100, 10), mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemLow, "0", mergeFuncUpdateCgroupIfLarger), + NewMergeableCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemHigh, strconv.FormatInt(testingPodMemRequestLimitBytes*50/100, 10), mergeFuncUpdateCgroupIfLarger), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkRatio, "95"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkScaleFactor, "20"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemWmarkMinAdj, "50"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemPriority, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemUsePriorityOom, "0"), + NewCommonCgroupResourceUpdater(ContainerOwnerRef(testingPodBEWithMemQOS.Pod.Namespace, testingPodBEWithMemQOS.Pod.Name, "main"), containerDirBE1, system.MemOomGroup, "0"), }, }, } @@ -815,26 +815,26 @@ func TestCgroupResourceReconcile_calculateResources(t *testing.T) { } func TestCgroupResourcesReconcile_getMergedPodResourceQoS(t *testing.T) { - testingNodeNoneResourceQoS := util.NoneResourceQoSStrategy().BE - testingMemoryQoSEnableResourceQoS := util.DefaultResourceQoSStrategy().BE // qos enable - testingMemoryQoSEnableResourceQoS.MemoryQoS.Enable = pointer.BoolPtr(true) 
- testingMemoryQoSNoneResourceQoS := util.NoneResourceQoSStrategy().BE // qos disable - testingMemoryQoSNoneResourceQoS.MemoryQoS = util.NoneResourceQoSStrategy().BE.MemoryQoS - testingMemoryQoSNoneResourceQoS1 := util.DefaultResourceQoSStrategy().BE // qos partially disable - testingMemoryQoSNoneResourceQoS1.MemoryQoS = util.NoneResourceQoSStrategy().BE.MemoryQoS - testingMemoryQoSAutoResourceQoS := util.NoneResourceQoSStrategy().BE - testingMemoryQoSAutoResourceQoS.MemoryQoS.MemoryQoS = *util.DefaultMemoryQoS(apiext.QoSBE) - testingMemoryQoSAutoResourceQoS1 := util.DefaultResourceQoSStrategy().BE - testingMemoryQoSAutoResourceQoS1.MemoryQoS.ThrottlingPercent = pointer.Int64Ptr(90) - testingMemoryQoSAutoResourceQoS2 := &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ - MemoryQoS: *util.DefaultMemoryQoS(apiext.QoSBE), + testingNodeNoneResourceQoS := util.NoneResourceQOSStrategy().BEClass + testingMemoryQoSEnableResourceQoS := util.DefaultResourceQOSStrategy().BEClass // qos enable + testingMemoryQoSEnableResourceQoS.MemoryQOS.Enable = pointer.BoolPtr(true) + testingMemoryQoSNoneResourceQoS := util.NoneResourceQOSStrategy().BEClass // qos disable + testingMemoryQoSNoneResourceQoS.MemoryQOS = util.NoneResourceQOSStrategy().BEClass.MemoryQOS + testingMemoryQoSNoneResourceQoS1 := util.DefaultResourceQOSStrategy().BEClass // qos partially disable + testingMemoryQoSNoneResourceQoS1.MemoryQOS = util.NoneResourceQOSStrategy().BEClass.MemoryQOS + testingMemoryQoSAutoResourceQoS := util.NoneResourceQOSStrategy().BEClass + testingMemoryQoSAutoResourceQoS.MemoryQOS.MemoryQOS = *util.DefaultMemoryQOS(apiext.QoSBE) + testingMemoryQoSAutoResourceQoS1 := util.DefaultResourceQOSStrategy().BEClass + testingMemoryQoSAutoResourceQoS1.MemoryQOS.ThrottlingPercent = pointer.Int64Ptr(90) + testingMemoryQoSAutoResourceQoS2 := &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ + MemoryQOS: *util.DefaultMemoryQOS(apiext.QoSBE), }, } - testingMemoryQoSAutoResourceQoS2.MemoryQoS.ThrottlingPercent = pointer.Int64Ptr(90) + testingMemoryQoSAutoResourceQoS2.MemoryQOS.ThrottlingPercent = pointer.Int64Ptr(90) type args struct { pod *corev1.Pod - cfg *slov1alpha1.ResourceQoS + cfg *slov1alpha1.ResourceQOS } type fields struct { resmanager *resmanager @@ -843,7 +843,7 @@ func TestCgroupResourcesReconcile_getMergedPodResourceQoS(t *testing.T) { name string fields fields args args - want *slov1alpha1.ResourceQoS + want *slov1alpha1.ResourceQOS wantErr bool }{ { @@ -866,9 +866,9 @@ func TestCgroupResourcesReconcile_getMergedPodResourceQoS(t *testing.T) { Phase: corev1.PodRunning, }, }, - cfg: defaultQoSStrategy().BE, + cfg: defaultQOSStrategy().BEClass, }, - want: defaultQoSStrategy().BE, + want: defaultQOSStrategy().BEClass, }, { name: "pod policy is None, use pod config", @@ -893,7 +893,7 @@ func TestCgroupResourcesReconcile_getMergedPodResourceQoS(t *testing.T) { Phase: corev1.PodRunning, }, }, - cfg: util.DefaultResourceQoSStrategy().BE, + cfg: util.DefaultResourceQOSStrategy().BEClass, }, want: testingMemoryQoSNoneResourceQoS1, }, @@ -947,7 +947,7 @@ func TestCgroupResourcesReconcile_getMergedPodResourceQoS(t *testing.T) { Phase: corev1.PodRunning, }, }, - cfg: util.DefaultResourceQoSStrategy().BE, + cfg: util.DefaultResourceQOSStrategy().BEClass, }, want: testingMemoryQoSAutoResourceQoS1, }, @@ -972,7 +972,7 @@ func TestCgroupResourcesReconcile_getMergedPodResourceQoS(t *testing.T) { Phase: corev1.PodRunning, }, }, - cfg: &slov1alpha1.ResourceQoS{}, + cfg: &slov1alpha1.ResourceQOS{}, }, 
want: testingMemoryQoSAutoResourceQoS2, }, @@ -1076,13 +1076,13 @@ func Test_makeCgroupResources(t *testing.T) { func Test_getPodResourceQoSByQoSClass(t *testing.T) { type args struct { pod *corev1.Pod - strategy *slov1alpha1.ResourceQoSStrategy + strategy *slov1alpha1.ResourceQOSStrategy config *Config } tests := []struct { name string args args - want *slov1alpha1.ResourceQoS + want *slov1alpha1.ResourceQOS }{ { name: "return nil", @@ -1093,28 +1093,28 @@ func Test_getPodResourceQoSByQoSClass(t *testing.T) { name: "get qos=LS config", args: args{ pod: createPod(corev1.PodQOSBurstable, apiext.QoSLS).Pod, - strategy: defaultQoSStrategy(), + strategy: defaultQOSStrategy(), config: NewDefaultConfig(), }, - want: defaultQoSStrategy().LS, + want: defaultQOSStrategy().LSClass, }, { name: "get qos=None kubeQoS=Burstable config", args: args{ pod: createPod(corev1.PodQOSBurstable, apiext.QoSNone).Pod, - strategy: defaultQoSStrategy(), + strategy: defaultQOSStrategy(), config: NewDefaultConfig(), }, - want: defaultQoSStrategy().LS, + want: defaultQOSStrategy().LSClass, }, { name: "get qos=None kubeQoS=Besteffort config", args: args{ pod: createPod(corev1.PodQOSBestEffort, apiext.QoSNone).Pod, - strategy: defaultQoSStrategy(), + strategy: defaultQOSStrategy(), config: NewDefaultConfig(), }, - want: defaultQoSStrategy().BE, + want: defaultQOSStrategy().BEClass, }, } for _, tt := range tests { @@ -1125,12 +1125,12 @@ func Test_getPodResourceQoSByQoSClass(t *testing.T) { } } -func defaultQoSStrategy() *slov1alpha1.ResourceQoSStrategy { - return &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ +func defaultQOSStrategy() *slov1alpha1.ResourceQOSStrategy { + return &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -1143,10 +1143,10 @@ func defaultQoSStrategy() *slov1alpha1.ResourceQoSStrategy { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -1159,10 +1159,10 @@ func defaultQoSStrategy() *slov1alpha1.ResourceQoSStrategy { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -1178,12 +1178,12 @@ func defaultQoSStrategy() *slov1alpha1.ResourceQoSStrategy { } } -func newValidQoSStrategy() *slov1alpha1.ResourceQoSStrategy { - return &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ +func newValidQOSStrategy() *slov1alpha1.ResourceQOSStrategy { + return &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ 
MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -1196,10 +1196,10 @@ func newValidQoSStrategy() *slov1alpha1.ResourceQoSStrategy { }, }, }, - LS: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -1212,10 +1212,10 @@ func newValidQoSStrategy() *slov1alpha1.ResourceQoSStrategy { }, }, }, - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -1231,17 +1231,17 @@ func newValidQoSStrategy() *slov1alpha1.ResourceQoSStrategy { } } -func mergeWithDefaultQoSStrategy(cfg *slov1alpha1.ResourceQoSStrategy) *slov1alpha1.ResourceQoSStrategy { - defaultCfg := defaultQoSStrategy() - cfg.LSR.MemoryQoS.WmarkRatio = defaultCfg.LSR.MemoryQoS.WmarkRatio - cfg.LSR.MemoryQoS.WmarkScalePermill = defaultCfg.LSR.MemoryQoS.WmarkScalePermill - cfg.LSR.MemoryQoS.WmarkMinAdj = defaultCfg.LSR.MemoryQoS.WmarkMinAdj - cfg.LS.MemoryQoS.WmarkRatio = defaultCfg.LS.MemoryQoS.WmarkRatio - cfg.LS.MemoryQoS.WmarkScalePermill = defaultCfg.LS.MemoryQoS.WmarkScalePermill - cfg.LS.MemoryQoS.WmarkMinAdj = defaultCfg.LS.MemoryQoS.WmarkMinAdj - cfg.BE.MemoryQoS.WmarkRatio = defaultCfg.BE.MemoryQoS.WmarkRatio - cfg.BE.MemoryQoS.WmarkScalePermill = defaultCfg.BE.MemoryQoS.WmarkScalePermill - cfg.BE.MemoryQoS.WmarkMinAdj = defaultCfg.BE.MemoryQoS.WmarkMinAdj +func mergeWithDefaultQOSStrategy(cfg *slov1alpha1.ResourceQOSStrategy) *slov1alpha1.ResourceQOSStrategy { + defaultCfg := defaultQOSStrategy() + cfg.LSRClass.MemoryQOS.WmarkRatio = defaultCfg.LSRClass.MemoryQOS.WmarkRatio + cfg.LSRClass.MemoryQOS.WmarkScalePermill = defaultCfg.LSRClass.MemoryQOS.WmarkScalePermill + cfg.LSRClass.MemoryQOS.WmarkMinAdj = defaultCfg.LSRClass.MemoryQOS.WmarkMinAdj + cfg.LSClass.MemoryQOS.WmarkRatio = defaultCfg.LSClass.MemoryQOS.WmarkRatio + cfg.LSClass.MemoryQOS.WmarkScalePermill = defaultCfg.LSClass.MemoryQOS.WmarkScalePermill + cfg.LSClass.MemoryQOS.WmarkMinAdj = defaultCfg.LSClass.MemoryQOS.WmarkMinAdj + cfg.BEClass.MemoryQOS.WmarkRatio = defaultCfg.BEClass.MemoryQOS.WmarkRatio + cfg.BEClass.MemoryQOS.WmarkScalePermill = defaultCfg.BEClass.MemoryQOS.WmarkScalePermill + cfg.BEClass.MemoryQOS.WmarkMinAdj = defaultCfg.BEClass.MemoryQOS.WmarkMinAdj return cfg } @@ -1311,8 +1311,8 @@ func createPod(kubeQosClass corev1.PodQOSClass, qosClass apiext.QoSClass) *state } } -func createPodWithMemoryQoS(kubeQosClass corev1.PodQOSClass, qosClass apiext.QoSClass, - memQoS *slov1alpha1.PodMemoryQoSConfig) *statesinformer.PodMeta { +func createPodWithMemoryQOS(kubeQosClass corev1.PodQOSClass, qosClass apiext.QoSClass, + memQoS *slov1alpha1.PodMemoryQOSConfig) *statesinformer.PodMeta { podMeta := createPod(kubeQosClass, qosClass) memQoSConfigBytes, _ := json.Marshal(memQoS) @@ -1323,10 +1323,10 @@ func createPodWithMemoryQoS(kubeQosClass corev1.PodQOSClass, qosClass apiext.QoS return podMeta } -func createNodeSLOWithQoSStrategy(qosStrategy *slov1alpha1.ResourceQoSStrategy) 
*slov1alpha1.NodeSLO { +func createNodeSLOWithQOSStrategy(qosStrategy *slov1alpha1.ResourceQOSStrategy) *slov1alpha1.NodeSLO { return &slov1alpha1.NodeSLO{ Spec: slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: qosStrategy, + ResourceQOSStrategy: qosStrategy, }, } } @@ -1351,62 +1351,62 @@ func assertCgroupResourceEqual(t *testing.T, expect, got []MergeableResourceUpda } } -func gotQoSStrategyFromFile() *slov1alpha1.ResourceQoSStrategy { - strategy := &slov1alpha1.ResourceQoSStrategy{} - strategy.LSR = readMemFromCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSGuaranteed)) - strategy.LS = readMemFromCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBurstable)) - strategy.BE = readMemFromCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBestEffort)) +func gotQOSStrategyFromFile() *slov1alpha1.ResourceQOSStrategy { + strategy := &slov1alpha1.ResourceQOSStrategy{} + strategy.LSRClass = readMemFromCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSGuaranteed)) + strategy.LSClass = readMemFromCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBurstable)) + strategy.BEClass = readMemFromCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBestEffort)) return strategy } -func initQoSCgroupFile(qos *slov1alpha1.ResourceQoSStrategy, helper *system.FileTestUtil) { - writeMemToCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSGuaranteed), qos.LSR, helper) - writeMemToCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBurstable), qos.LS, helper) - writeMemToCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBestEffort), qos.BE, helper) +func initQOSCgroupFile(qos *slov1alpha1.ResourceQOSStrategy, helper *system.FileTestUtil) { + writeMemToCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSGuaranteed), qos.LSRClass, helper) + writeMemToCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBurstable), qos.LSClass, helper) + writeMemToCgroupFile(util.GetKubeQosRelativePath(corev1.PodQOSBestEffort), qos.BEClass, helper) } -func readMemFromCgroupFile(parentDir string) *slov1alpha1.ResourceQoS { - resourceQoS := &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{}, +func readMemFromCgroupFile(parentDir string) *slov1alpha1.ResourceQOS { + resourceQoS := &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{}, } // dynamic resources, calculate with pod request/limit=1GiB // testingPodMemRequestLimitBytes = 1073741824 minLimitPercent, _ := system.CgroupFileReadInt(parentDir, system.MemMin) if minLimitPercent != nil { - resourceQoS.MemoryQoS.MinLimitPercent = pointer.Int64Ptr((*minLimitPercent) * 100 / testingPodMemRequestLimitBytes) + resourceQoS.MemoryQOS.MinLimitPercent = pointer.Int64Ptr((*minLimitPercent) * 100 / testingPodMemRequestLimitBytes) } lowLimitPercent, _ := system.CgroupFileReadInt(parentDir, system.MemLow) if lowLimitPercent != nil { - resourceQoS.MemoryQoS.LowLimitPercent = pointer.Int64Ptr((*lowLimitPercent) * 100 / testingPodMemRequestLimitBytes) + resourceQoS.MemoryQOS.LowLimitPercent = pointer.Int64Ptr((*lowLimitPercent) * 100 / testingPodMemRequestLimitBytes) } throttlingPercent, _ := system.CgroupFileReadInt(parentDir, system.MemHigh) if throttlingPercent != nil { - resourceQoS.MemoryQoS.ThrottlingPercent = pointer.Int64Ptr(0) // assert test setting disabled + resourceQoS.MemoryQOS.ThrottlingPercent = pointer.Int64Ptr(0) // assert test setting disabled } // static resources - resourceQoS.MemoryQoS.WmarkRatio, _ = system.CgroupFileReadInt(parentDir, system.MemWmarkRatio) - resourceQoS.MemoryQoS.WmarkScalePermill, _ = system.CgroupFileReadInt(parentDir, 
system.MemWmarkScaleFactor) - resourceQoS.MemoryQoS.WmarkMinAdj, _ = system.CgroupFileReadInt(parentDir, system.MemWmarkMinAdj) - resourceQoS.MemoryQoS.PriorityEnable, _ = system.CgroupFileReadInt(parentDir, system.MemUsePriorityOom) - resourceQoS.MemoryQoS.Priority, _ = system.CgroupFileReadInt(parentDir, system.MemPriority) - resourceQoS.MemoryQoS.OomKillGroup, _ = system.CgroupFileReadInt(parentDir, system.MemOomGroup) + resourceQoS.MemoryQOS.WmarkRatio, _ = system.CgroupFileReadInt(parentDir, system.MemWmarkRatio) + resourceQoS.MemoryQOS.WmarkScalePermill, _ = system.CgroupFileReadInt(parentDir, system.MemWmarkScaleFactor) + resourceQoS.MemoryQOS.WmarkMinAdj, _ = system.CgroupFileReadInt(parentDir, system.MemWmarkMinAdj) + resourceQoS.MemoryQOS.PriorityEnable, _ = system.CgroupFileReadInt(parentDir, system.MemUsePriorityOom) + resourceQoS.MemoryQOS.Priority, _ = system.CgroupFileReadInt(parentDir, system.MemPriority) + resourceQoS.MemoryQOS.OomKillGroup, _ = system.CgroupFileReadInt(parentDir, system.MemOomGroup) // assume NONE cfg equals to disabled - memoryQoSDisabled := reflect.DeepEqual(util.NoneMemoryQoS(), &resourceQoS.MemoryQoS) - resourceQoS.MemoryQoS.Enable = pointer.BoolPtr(!memoryQoSDisabled) + memoryQoSDisabled := reflect.DeepEqual(util.NoneMemoryQOS(), &resourceQoS.MemoryQOS) + resourceQoS.MemoryQOS.Enable = pointer.BoolPtr(!memoryQoSDisabled) return resourceQoS } -func writeMemToCgroupFile(parentDir string, qos *slov1alpha1.ResourceQoS, helper *system.FileTestUtil) { +func writeMemToCgroupFile(parentDir string, qos *slov1alpha1.ResourceQOS, helper *system.FileTestUtil) { helper.WriteCgroupFileContents(parentDir, system.MemMin, "0") helper.WriteCgroupFileContents(parentDir, system.MemLow, "0") helper.WriteCgroupFileContents(parentDir, system.MemHigh, strconv.FormatInt(math.MaxInt64, 10)) - helper.WriteCgroupFileContents(parentDir, system.MemWmarkRatio, strconv.FormatInt(*qos.MemoryQoS.WmarkRatio, 10)) - helper.WriteCgroupFileContents(parentDir, system.MemWmarkMinAdj, strconv.FormatInt(*qos.MemoryQoS.WmarkMinAdj, 10)) - helper.WriteCgroupFileContents(parentDir, system.MemWmarkScaleFactor, strconv.FormatInt(*qos.MemoryQoS.WmarkScalePermill, 10)) - helper.WriteCgroupFileContents(parentDir, system.MemUsePriorityOom, strconv.FormatInt(*qos.MemoryQoS.PriorityEnable, 10)) - helper.WriteCgroupFileContents(parentDir, system.MemPriority, strconv.FormatInt(*qos.MemoryQoS.Priority, 10)) - helper.WriteCgroupFileContents(parentDir, system.MemOomGroup, strconv.FormatInt(*qos.MemoryQoS.OomKillGroup, 10)) + helper.WriteCgroupFileContents(parentDir, system.MemWmarkRatio, strconv.FormatInt(*qos.MemoryQOS.WmarkRatio, 10)) + helper.WriteCgroupFileContents(parentDir, system.MemWmarkMinAdj, strconv.FormatInt(*qos.MemoryQOS.WmarkMinAdj, 10)) + helper.WriteCgroupFileContents(parentDir, system.MemWmarkScaleFactor, strconv.FormatInt(*qos.MemoryQOS.WmarkScalePermill, 10)) + helper.WriteCgroupFileContents(parentDir, system.MemUsePriorityOom, strconv.FormatInt(*qos.MemoryQOS.PriorityEnable, 10)) + helper.WriteCgroupFileContents(parentDir, system.MemPriority, strconv.FormatInt(*qos.MemoryQOS.Priority, 10)) + helper.WriteCgroupFileContents(parentDir, system.MemOomGroup, strconv.FormatInt(*qos.MemoryQOS.OomKillGroup, 10)) } diff --git a/pkg/koordlet/resmanager/config.go b/pkg/koordlet/resmanager/config.go index 1f660be88..a4a991ce2 100644 --- a/pkg/koordlet/resmanager/config.go +++ b/pkg/koordlet/resmanager/config.go @@ -41,10 +41,10 @@ func NewDefaultConfig() *Config { } func (c *Config) InitFlags(fs 
*flag.FlagSet) { - fs.IntVar(&c.ReconcileIntervalSeconds, "ReconcileIntervalSeconds", c.ReconcileIntervalSeconds, "reconcile be pod cgroup interval by seconds") - fs.IntVar(&c.CPUSuppressIntervalSeconds, "CPUSuppressIntervalSeconds", c.CPUSuppressIntervalSeconds, "suppress be pod cpu resource interval by seconds") - fs.IntVar(&c.CPUEvictIntervalSeconds, "CPUEvictIntervalSeconds", c.CPUEvictIntervalSeconds, "evict be pod(cpu) interval by seconds") - fs.IntVar(&c.MemoryEvictIntervalSeconds, "MemoryEvictIntervalSeconds", c.MemoryEvictIntervalSeconds, "evict be pod(memory) interval by seconds") - fs.IntVar(&c.MemoryEvictCoolTimeSeconds, "MemoryEvictCoolTimeSeconds", c.MemoryEvictCoolTimeSeconds, "cooling time: memory next evict time should after lastEvictTime + MemoryEvictCoolTimeSeconds") - fs.IntVar(&c.CPUEvictCoolTimeSeconds, "CPUEvictCoolTimeSeconds", c.CPUEvictCoolTimeSeconds, "cooltime: CPU next evict time should after lastEvictTime + CPUEvictCoolTimeSeconds") + fs.IntVar(&c.ReconcileIntervalSeconds, "reconcile-interval-seconds", c.ReconcileIntervalSeconds, "reconcile be pod cgroup interval by seconds") + fs.IntVar(&c.CPUSuppressIntervalSeconds, "cpu-suppress-interval-seconds", c.CPUSuppressIntervalSeconds, "suppress be pod cpu resource interval by seconds") + fs.IntVar(&c.CPUEvictIntervalSeconds, "cpu-evict-interval-seconds", c.CPUEvictIntervalSeconds, "evict be pod(cpu) interval by seconds") + fs.IntVar(&c.MemoryEvictIntervalSeconds, "memory-evict-interval-seconds", c.MemoryEvictIntervalSeconds, "evict be pod(memory) interval by seconds") + fs.IntVar(&c.MemoryEvictCoolTimeSeconds, "memory-evict-cool-time-seconds", c.MemoryEvictCoolTimeSeconds, "cooling time: memory next evict time should after lastEvictTime + MemoryEvictCoolTimeSeconds") + fs.IntVar(&c.CPUEvictCoolTimeSeconds, "cpu-evict-cool-time-seconds", c.CPUEvictCoolTimeSeconds, "cooltime: CPU next evict time should after lastEvictTime + CPUEvictCoolTimeSeconds") } diff --git a/pkg/koordlet/resmanager/config_test.go b/pkg/koordlet/resmanager/config_test.go new file mode 100644 index 000000000..2354f317f --- /dev/null +++ b/pkg/koordlet/resmanager/config_test.go @@ -0,0 +1,96 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package resmanager + +import ( + "flag" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_NewDefaultConfig(t *testing.T) { + expectConfig := &Config{ + ReconcileIntervalSeconds: 1, + CPUSuppressIntervalSeconds: 1, + CPUEvictIntervalSeconds: 1, + MemoryEvictIntervalSeconds: 1, + MemoryEvictCoolTimeSeconds: 4, + CPUEvictCoolTimeSeconds: 20, + } + defaultConfig := NewDefaultConfig() + assert.Equal(t, expectConfig, defaultConfig) +} + +func Test_InitFlags(t *testing.T) { + cmdArgs := []string{ + "", + "--reconcile-interval-seconds=2", + "--cpu-suppress-interval-seconds=2", + "--cpu-evict-interval-seconds=2", + "--memory-evict-interval-seconds=2", + "--memory-evict-cool-time-seconds=8", + "--cpu-evict-cool-time-seconds=40", + } + fs := flag.NewFlagSet(cmdArgs[0], flag.ExitOnError) + + type fields struct { + ReconcileIntervalSeconds int + CPUSuppressIntervalSeconds int + CPUEvictIntervalSeconds int + MemoryEvictIntervalSeconds int + MemoryEvictCoolTimeSeconds int + CPUEvictCoolTimeSeconds int + } + type args struct { + fs *flag.FlagSet + } + tests := []struct { + name string + fields fields + args args + }{ + { + name: "not default", + fields: fields{ + ReconcileIntervalSeconds: 2, + CPUSuppressIntervalSeconds: 2, + CPUEvictIntervalSeconds: 2, + MemoryEvictIntervalSeconds: 2, + MemoryEvictCoolTimeSeconds: 8, + CPUEvictCoolTimeSeconds: 40, + }, + args: args{fs: fs}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + raw := &Config{ + ReconcileIntervalSeconds: tt.fields.ReconcileIntervalSeconds, + CPUSuppressIntervalSeconds: tt.fields.CPUSuppressIntervalSeconds, + CPUEvictIntervalSeconds: tt.fields.CPUEvictIntervalSeconds, + MemoryEvictIntervalSeconds: tt.fields.MemoryEvictIntervalSeconds, + MemoryEvictCoolTimeSeconds: tt.fields.MemoryEvictCoolTimeSeconds, + CPUEvictCoolTimeSeconds: tt.fields.CPUEvictCoolTimeSeconds, + } + c := NewDefaultConfig() + c.InitFlags(tt.args.fs) + tt.args.fs.Parse(cmdArgs[1:]) + assert.Equal(t, raw, c) + }) + } +} diff --git a/pkg/koordlet/resmanager/cpu_burst.go b/pkg/koordlet/resmanager/cpu_burst.go index bcd0ddd60..6e7defa15 100644 --- a/pkg/koordlet/resmanager/cpu_burst.go +++ b/pkg/koordlet/resmanager/cpu_burst.go @@ -509,12 +509,16 @@ func (b *CPUBurst) applyCPUBurst(burstCfg *slov1alpha1.CPUBurstConfig, podMeta * containerCFSBurstValStr := strconv.FormatInt(containerCFSBurstVal, 10) updater := NewCommonCgroupResourceUpdater(ownerRef, containerDir, system.CPUBurst, containerCFSBurstValStr) updated, err := b.executor.UpdateByCache(updater) - if err != nil { + if err == nil { + klog.V(5).Infof("apply container %v/%v/%v cpu burst value success, dir %v, value %v", + pod.Namespace, pod.Name, containerStat.Name, containerDir, containerCFSBurstVal) + } else if system.HostSystemInfo.IsAnolisOS { + // cgroup `cpu.burst_us` is expected available on anolis os, and it may not exist in other kernels. 
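// A condensed sketch of the logging branch added above, assuming only the helpers already used in this
// file (executor.UpdateByCache, system.HostSystemInfo.IsAnolisOS); log messages are abbreviated here:
//     updated, err := b.executor.UpdateByCache(updater)
//     switch {
//     case err == nil:
//         klog.V(5).Infof("cpu burst applied, updated %v", updated) // success stays at verbose level
//     case system.HostSystemInfo.IsAnolisOS:
//         klog.Infof("cpu burst update failed: %v", err) // cpu.burst_us is expected on Anolis OS, so log loudly
//     default:
//         klog.V(4).Infof("cpu burst ignored on non Anolis OS: %v", err) // the cgroup file may simply not exist
//     }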
klog.Infof("update container %v/%v/%v cpu burst failed, dir %v, updated %v, error %v", pod.Namespace, pod.Name, containerStat.Name, containerDir, updated, err) } else { - klog.V(5).Infof("apply container %v/%v/%v cpu burst value success, dir %v, value %v", - pod.Namespace, pod.Name, containerStat.Name, containerDir, containerCFSBurstVal) + klog.V(4).Infof("update container %v/%v/%v cpu burst ignored on non Anolis OS, dir %v, "+ + "updated %v, info %v", pod.Namespace, pod.Name, containerStat.Name, containerDir, updated, err) } } } // end for containers @@ -525,12 +529,16 @@ func (b *CPUBurst) applyCPUBurst(burstCfg *slov1alpha1.CPUBurstConfig, podMeta * podCFSBurstValStr := strconv.FormatInt(podCFSBurstVal, 10) updater := NewCommonCgroupResourceUpdater(ownerRef, podDir, system.CPUBurst, podCFSBurstValStr) updated, err := b.executor.UpdateByCache(updater) - if err != nil { + if err == nil { + klog.V(5).Infof("apply pod %v/%v cpu burst value success, dir %v, value %v", + pod.Namespace, pod.Name, podDir, podCFSBurstValStr) + } else if system.HostSystemInfo.IsAnolisOS { + // cgroup `cpu.burst_us` is expected available on anolis os, and it may not exist in other kernels. klog.Infof("update pod %v/%v cpu burst failed, dir %v, updated %v, error %v", pod.Namespace, pod.Name, podDir, updated, err) } else { - klog.V(5).Infof("apply pod %v/%v cpu burst value success, dir %v, value %v", - pod.Namespace, pod.Name, podDir, podCFSBurstValStr) + klog.V(4).Infof("update pod %v/%v cpu burst ignored on non Anolis OS, dir %v, updated %v, "+ + "info %v", pod.Namespace, pod.Name, podDir, updated, err) } } } diff --git a/pkg/koordlet/resmanager/cpu_suppress.go b/pkg/koordlet/resmanager/cpu_suppress.go index 64e62775d..d57684ee1 100644 --- a/pkg/koordlet/resmanager/cpu_suppress.go +++ b/pkg/koordlet/resmanager/cpu_suppress.go @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" apiext "github.com/koordinator-sh/koordinator/apis/extension" slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" @@ -158,98 +159,111 @@ func (r *CPUSuppress) calculateBESuppressCPU(node *corev1.Node, nodeMetric *metr } // calculateBESuppressPolicy calculates the be cpu suppress policy with cpuset cpus number and node cpu info -func calculateBESuppressCPUSetPolicy(cpusetQuantity *resource.Quantity, oldCPUSetNum int, nodeCPUInfo *metriccache.NodeCPUInfo) []int32 { - // set the number of cpuset cpus no less than 2 - cpus := int32(math.Ceil(float64(cpusetQuantity.MilliValue()) / 1000)) - if cpus < 2 { - cpus = 2 - } - +func calculateBESuppressCPUSetPolicy(cpus int32, processorInfos []util.ProcessorInfo) []int32 { var CPUSets []int32 - - numProcessors := int32(len(nodeCPUInfo.ProcessorInfos)) + numProcessors := int32(len(processorInfos)) if numProcessors < cpus { klog.Warningf("failed to calculate a proper suppress policy, available cpus is not enough, "+ "please check the related resource metrics: want cpus %v but got %v", cpus, numProcessors) return CPUSets } - beMaxIncreaseCpuNum := int32(math.Ceil(float64(numProcessors) * beMaxIncreaseCPUPercent)) - if cpus-int32(oldCPUSetNum) > beMaxIncreaseCpuNum { - cpus = int32(oldCPUSetNum) + beMaxIncreaseCpuNum - } - - // NOTE: here we prioritize the processors by the affinity of numa node/socket and hyper-threading so that the - // cpuset cpus can be picked in order, rather than scoring all the possible policies. 
- // TBD: may just use a lookup table for a steady state - - // initially sorted by processor id - prioritizedCPUs := nodeCPUInfo.ProcessorInfos - sort.Slice(prioritizedCPUs, func(i, j int) bool { - return prioritizedCPUs[i].CPUID < prioritizedCPUs[j].CPUID - }) - // getNodeIndex is a function to calculate an index for every numa node or socket getNodeIndex := func(info util.ProcessorInfo) int32 { + // (nodeId, socketId) => nodeIndex return (info.NodeID + numProcessors) * (info.SocketID + 1) } + cpuBucketOfNode := map[int32][]util.ProcessorInfo{} + for _, p := range processorInfos { + nodeIndex := getNodeIndex(p) + cpuBucketOfNode[nodeIndex] = append(cpuBucketOfNode[nodeIndex], p) + } - // get the relative indexes of the processor on its node - indexOfNode := map[int32]int32{} - numProcessorsOfNode := map[int32]int32{} - for _, processor := range prioritizedCPUs { - numProcessorsOfNode[getNodeIndex(processor)]++ - if indexOfNode[processor.CoreID] <= 0 { - indexOfNode[processor.CoreID] = numProcessorsOfNode[getNodeIndex(processor)] - } + // change cpuBucket map to array + cpuBucket := [][]util.ProcessorInfo{} + for _, processorInfos := range cpuBucketOfNode { + cpuBucket = append(cpuBucket, processorInfos) } - // first we prefer the scattered node/socket, secondly the same core - // e.g. - // for a processorInfo/prioritizedCPUs list: - // CPUID CoreID SocketID NodeID - // 0 0 0 0 - // 1 1 0 0 - // 2 2 1 1 - // 3 3 1 1 - // 4 0 0 0 - // 5 1 0 0 - // 6 2 1 1 - // 7 3 1 1 - // it will get sorted as below: - // CPUID CoreID SocketID NodeID - // 7 3 1 1 - // 3 3 1 1 - // 5 1 0 0 - // 1 1 0 0 - // 6 2 1 1 - // 2 2 1 1 - // 4 0 0 0 - // 0 0 0 0 - sort.Slice(prioritizedCPUs, func(i, j int) bool { - a, b := prioritizedCPUs[i], prioritizedCPUs[j] - if indexOfNode[a.CoreID] != indexOfNode[b.CoreID] { - return indexOfNode[a.CoreID] > indexOfNode[b.CoreID] - } - if numProcessorsOfNode[getNodeIndex(a)] != numProcessorsOfNode[getNodeIndex(b)] { - return numProcessorsOfNode[getNodeIndex(a)] > numProcessorsOfNode[getNodeIndex(b)] - } - if a.CoreID != b.CoreID { - return a.CoreID > b.CoreID + for index := range cpuBucket { + sort.Slice(cpuBucket[index], func(i, j int) bool { + if cpuBucket[index][i].CoreID == cpuBucket[index][j].CoreID { + return cpuBucket[index][i].CPUID < cpuBucket[index][j].CPUID + } + return cpuBucket[index][i].CoreID < cpuBucket[index][j].CoreID + }) + } + + sort.Slice(cpuBucket, func(i, j int) bool { + if len(cpuBucket[i]) == len(cpuBucket[j]) { + return cpuBucket[i][0].CPUID < cpuBucket[j][0].CPUID } - return a.CPUID > b.CPUID + return len(cpuBucket[i]) > len(cpuBucket[j]) }) needCPUs := cpus - for i := range prioritizedCPUs { + usedCpu := map[int32]bool{} + // select same core cpu id + preNeedCpus := int32(-1) + i := 0 + for ; i < len(cpuBucket); i = (i + 1) % len(cpuBucket) { + if needCPUs <= 1 { + break + } + if i == 0 { + // if we don't pick any cpu, we need break this cycle + if preNeedCpus == needCPUs { + break + } + preNeedCpus = needCPUs + } + selectdIndex := -1 + for j := 0; j < len(cpuBucket[i])-1; j++ { + if usedCpu[cpuBucket[i][j].CPUID] { + continue + } + if cpuBucket[i][j].CoreID == cpuBucket[i][j+1].CoreID { + selectdIndex = j + break + } + } + if selectdIndex != -1 { + CPUSets = append(CPUSets, cpuBucket[i][selectdIndex].CPUID, cpuBucket[i][selectdIndex+1].CPUID) + usedCpu[cpuBucket[i][selectdIndex].CPUID] = true + usedCpu[cpuBucket[i][selectdIndex+1].CPUID] = true + needCPUs = needCPUs - 2 + } + } + + // select single cpu id + preNeedCpus = int32(-1) + startIndex := i 
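// Worked example of the two passes above (HT sibling pairs first, then single CPUs), using the 8-CPU
// topology from the tests below: siblings (0,4) and (1,5) on socket/node 0, (2,6) and (3,7) on
// socket/node 1, with cpus=3. Bucketing by (NodeID, SocketID) gives {0,4,1,5} and {2,6,3,7}; each bucket
// is sorted by CoreID, then the buckets by size (ties broken by the first CPUID). The pair pass takes
// siblings 0 and 4 from the first bucket (needCPUs: 3 -> 1); the single-CPU pass then takes CPU 2 from
// the second bucket, so the result is []int32{0, 4, 2} — matching the "stacking on HTs 0." case in the
// updated test table.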
+ for ; i < len(cpuBucket); i = (i + 1) % len(cpuBucket) { if needCPUs <= 0 { break } - CPUSets = append(CPUSets, prioritizedCPUs[i].CPUID) - needCPUs-- + if i == startIndex { + // if we don't pick any cpu, we need break this cycle + if preNeedCpus == needCPUs { + break + } + preNeedCpus = needCPUs + } + selectdIndex := -1 + for j := 0; j < len(cpuBucket[i]); j++ { + if usedCpu[cpuBucket[i][j].CPUID] { + continue + } + selectdIndex = j + break + } + if selectdIndex != -1 { + CPUSets = append(CPUSets, cpuBucket[i][selectdIndex].CPUID) + usedCpu[cpuBucket[i][selectdIndex].CPUID] = true + needCPUs-- + } } klog.Infof("calculated BE suppress policy: cpuset %v", CPUSets) - return CPUSets } @@ -341,20 +355,69 @@ func (r *CPUSuppress) suppressBECPU() { r.suppressPolicyStatuses[string(slov1alpha1.CPUCfsQuotaPolicy)] = policyUsing r.recoverCPUSetIfNeed() } else { - adjustByCPUSet(suppressCPUQuantity, nodeCPUInfo) + r.adjustByCPUSet(suppressCPUQuantity, nodeCPUInfo) r.suppressPolicyStatuses[string(slov1alpha1.CPUSetPolicy)] = policyUsing r.recoverCFSQuotaIfNeed() } } -func adjustByCPUSet(cpusetQuantity *resource.Quantity, nodeCPUInfo *metriccache.NodeCPUInfo) { +func (r *CPUSuppress) adjustByCPUSet(cpusetQuantity *resource.Quantity, nodeCPUInfo *metriccache.NodeCPUInfo) { oldCPUSet, err := util.GetRootCgroupCurCPUSet(corev1.PodQOSBestEffort) if err != nil { klog.Warningf("applyBESuppressPolicy failed to get current best-effort cgroup cpuset, err: %s", err) return } - beCPUSet := calculateBESuppressCPUSetPolicy(cpusetQuantity, len(oldCPUSet), nodeCPUInfo) + podMetas := r.resmanager.statesInformer.GetAllPods() + // value: 0 -> lse, 1 -> lsr, not exists -> others + cpuIdToPool := map[int32]apiext.QoSClass{} + for _, podMeta := range podMetas { + alloc, err := apiext.GetResourceStatus(podMeta.Pod.Annotations) + if err != nil { + continue + } + if alloc.CPUSet != "" { + set, err := cpuset.Parse(alloc.CPUSet) + if err != nil { + klog.Errorf("failed to parse cpuset info of pod %s, err: %v", podMeta.Pod.Name, err) + continue + } + for _, cpuID := range set.ToSliceNoSort() { + cpuIdToPool[int32(cpuID)] = apiext.GetPodQoSClass(podMeta.Pod) + } + } + } + lsrCpus := []util.ProcessorInfo{} + lsCpus := []util.ProcessorInfo{} + // FIXME: be pods might be starved since lse pods can run out of all cpus + for _, processor := range nodeCPUInfo.ProcessorInfos { + if cpuIdToPool[processor.CPUID] == apiext.QoSLSR { + lsrCpus = append(lsrCpus, processor) + } else if cpuIdToPool[processor.CPUID] != apiext.QoSLSE { + lsCpus = append(lsCpus, processor) + } + } + + // set the number of cpuset cpus no less than 2 + cpus := int32(math.Ceil(float64(cpusetQuantity.MilliValue()) / 1000)) + if cpus < 2 { + cpus = 2 + } + beMaxIncreaseCpuNum := int32(math.Ceil(float64(len(nodeCPUInfo.ProcessorInfos)) * beMaxIncreaseCPUPercent)) + if cpus-int32(len(oldCPUSet)) > beMaxIncreaseCpuNum { + cpus = int32(len(oldCPUSet)) + beMaxIncreaseCpuNum + } + var beCPUSet []int32 + lsrCpuNums := int32(int(cpus) * len(lsrCpus) / (len(lsrCpus) + len(lsCpus))) + + if lsrCpuNums > 0 { + beCPUSetFromLSR := calculateBESuppressCPUSetPolicy(lsrCpuNums, lsrCpus) + beCPUSet = append(beCPUSet, beCPUSetFromLSR...) + } + if cpus-lsrCpuNums > 0 { + beCPUSetFromLS := calculateBESuppressCPUSetPolicy(cpus-lsrCpuNums, lsCpus) + beCPUSet = append(beCPUSet, beCPUSetFromLS...) 
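// Rough arithmetic behind the LSR/LS split above: BE CPUs are drawn from the two pools in proportion to
// their sizes, and CPUs pinned by LSE pods are excluded from both pools. On a hypothetical 16-CPU node
// where LSR pods pin 4 CPUs (len(lsrCpus)=4, len(lsCpus)=12) and the suppress target is cpus=8:
//     lsrCpuNums = 8 * 4 / (4 + 12) = 2   // integer division
// so 2 BE CPUs are picked out of the LSR pool and the remaining 8-2=6 out of the LS pool.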
+ } // the new be suppress always need to apply since: // - for a reduce of BE cpuset, we should make effort to protecting LS no matter how huge the decrease is; @@ -374,19 +437,49 @@ func (r *CPUSuppress) recoverCPUSetIfNeed() { return } - rootCPUSet, err := util.GetRootCgroupCurCPUSet(corev1.PodQOSGuaranteed) + cpus := []int{} + nodeInfo, err := r.resmanager.metricCache.GetNodeCPUInfo(&metriccache.QueryParam{}) if err != nil { - klog.Warningf("recover bestEffort cpuset failed, get current root cgroup cpuset err: %s", err) return } + for _, p := range nodeInfo.ProcessorInfos { + cpus = append(cpus, int(p.CPUID)) + } + + beCPUSet := cpuset.NewCPUSet(cpus...) + lseCPUID := make(map[int]bool) + podMetas := r.resmanager.statesInformer.GetAllPods() + for _, podMeta := range podMetas { + alloc, err := apiext.GetResourceStatus(podMeta.Pod.Annotations) + if err != nil { + continue + } + if apiext.GetPodQoSClass(podMeta.Pod) != apiext.QoSLSE { + continue + } + if alloc.CPUSet != "" { + set, err := cpuset.Parse(alloc.CPUSet) + if err != nil { + klog.Errorf("failed to parse cpuset info of pod %s, err: %v", podMeta.Pod.Name, err) + continue + } + for _, cpuID := range set.ToSliceNoSort() { + lseCPUID[cpuID] = true + } + } + } + beCPUSet.Filter(func(ID int) bool { + return !lseCPUID[ID] + }) + cpusetCgroupPaths, err := getBECgroupCPUSetPathsRecursive() if err != nil { klog.Warningf("recover bestEffort cpuset failed, get be cgroup cpuset paths err: %s", err) return } - cpusetStr := util.GenerateCPUSetStr(rootCPUSet) - klog.V(6).Infof("recover bestEffort cpuset, cpuset %v", rootCPUSet) + cpusetStr := beCPUSet.String() + klog.V(6).Infof("recover bestEffort cpuset, cpuset %v", cpusetStr) writeBECgroupsCPUSet(cpusetCgroupPaths, cpusetStr, false) r.suppressPolicyStatuses[string(slov1alpha1.CPUSetPolicy)] = policyRecovered } diff --git a/pkg/koordlet/resmanager/cpu_suppress_test.go b/pkg/koordlet/resmanager/cpu_suppress_test.go index 8c502af40..2f0dcb004 100644 --- a/pkg/koordlet/resmanager/cpu_suppress_test.go +++ b/pkg/koordlet/resmanager/cpu_suppress_test.go @@ -188,7 +188,7 @@ func Test_cpuSuppress_suppressBECPU(t *testing.T) { }, wantBECFSQuota: 3.2 * defaultCFSPeriod, wantCFSQuotaPolicyStatus: &policyUsing, - wantBECPUSet: "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15", + wantBECPUSet: "0-15", wantCPUSetPolicyStatus: &policyRecovered, }, { @@ -312,7 +312,7 @@ func Test_cpuSuppress_suppressBECPU(t *testing.T) { }, wantBECFSQuota: 1.2 * defaultCFSPeriod, wantCFSQuotaPolicyStatus: &policyUsing, - wantBECPUSet: "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15", + wantBECPUSet: "0-15", wantCPUSetPolicyStatus: &policyRecovered, }, { @@ -436,7 +436,7 @@ func Test_cpuSuppress_suppressBECPU(t *testing.T) { }, wantBECFSQuota: -1, wantCFSQuotaPolicyStatus: &policyRecovered, - wantBECPUSet: "15,14", + wantBECPUSet: "0,1", wantCPUSetPolicyStatus: &policyUsing, }, { @@ -560,7 +560,7 @@ func Test_cpuSuppress_suppressBECPU(t *testing.T) { }, wantBECFSQuota: -1, wantCFSQuotaPolicyStatus: &policyRecovered, - wantBECPUSet: "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15", + wantBECPUSet: "0-15", wantCPUSetPolicyStatus: &policyRecovered, }, } @@ -872,9 +872,28 @@ func Test_cpuSuppress_calculateBESuppressCPU(t *testing.T) { func Test_cpuSuppress_recoverCPUSetIfNeed(t *testing.T) { type args struct { oldCPUSets string - rootCPUSets string currentPolicyStatus *suppressPolicyStatus } + mockNodeInfo := metriccache.NodeCPUInfo{ + ProcessorInfos: []util.ProcessorInfo{ + {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 1, CoreID: 0, SocketID: 0, 
NodeID: 0}, + {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 1}, + {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 1}, + {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 8, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 9, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 10, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 11, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 12, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 13, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 14, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 15, CoreID: 3, SocketID: 1, NodeID: 1}, + }, + } tests := []struct { name string args args @@ -885,27 +904,24 @@ func Test_cpuSuppress_recoverCPUSetIfNeed(t *testing.T) { name: "test need recover. currentPolicyStatus is nil", args: args{ oldCPUSets: "7,6,3,2", - rootCPUSets: "0-15", currentPolicyStatus: nil, }, - wantCPUSet: "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15", + wantCPUSet: "0-15", wantPolicyStatus: &policyRecovered, }, { name: "test need recover. currentPolicyStatus is policyUsing", args: args{ oldCPUSets: "7,6,3,2", - rootCPUSets: "0-15", currentPolicyStatus: &policyUsing, }, - wantCPUSet: "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15", + wantCPUSet: "0-15", wantPolicyStatus: &policyRecovered, }, { name: "test not need recover. currentPolicyStatus is policyRecovered", args: args{ oldCPUSets: "7,6,3,2", - rootCPUSets: "0-15", currentPolicyStatus: &policyRecovered, }, wantCPUSet: "7,6,3,2", @@ -918,10 +934,17 @@ func Test_cpuSuppress_recoverCPUSetIfNeed(t *testing.T) { helper := system.NewFileTestUtil(t) podDirs := []string{"pod1", "pod2", "pod3"} testingPrepareBECgroupData(helper, podDirs, tt.args.oldCPUSets) - helper.WriteCgroupFileContents(util.GetKubeQosRelativePath(corev1.PodQOSGuaranteed), system.CPUSet, tt.args.rootCPUSets) - - r := resmanager{} - cpuSuppress := NewCPUSuppress(&r) + lsePod := mockLSEPod() + ctl := gomock.NewController(t) + mockMetricCache := mockmetriccache.NewMockMetricCache(ctl) + mockStatesInformer := mockstatesinformer.NewMockStatesInformer(ctl) + mockStatesInformer.EXPECT().GetAllPods().Return([]*statesinformer.PodMeta{{Pod: lsePod}}).AnyTimes() + mockMetricCache.EXPECT().GetNodeCPUInfo(gomock.Any()).Return(&mockNodeInfo, nil).AnyTimes() + r := &resmanager{ + statesInformer: mockStatesInformer, + metricCache: mockMetricCache, + } + cpuSuppress := NewCPUSuppress(r) if tt.args.currentPolicyStatus != nil { cpuSuppress.suppressPolicyStatuses[string(slov1alpha1.CPUSetPolicy)] = *tt.args.currentPolicyStatus } @@ -992,9 +1015,9 @@ func Test_cpuSuppress_recoverCFSQuotaIfNeed(t *testing.T) { func Test_calculateBESuppressCPUSetPolicy(t *testing.T) { type args struct { - cpusetQuantity *resource.Quantity - nodeCPUInfo *metriccache.NodeCPUInfo - oldCPUSetNum int + cpus int32 + processorInfo []util.ProcessorInfo + oldCPUSetNum int } tests := []struct { name string @@ -1004,186 +1027,135 @@ func Test_calculateBESuppressCPUSetPolicy(t *testing.T) { { name: "do not panic but return empty cpuset for insufficient cpus", args: args{ - cpusetQuantity: resource.NewQuantity(0, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{}, - oldCPUSetNum: 0, + cpus: 0, + processorInfo: []util.ProcessorInfo{}, + oldCPUSetNum: 0, }, want: nil, }, - { - name: "at least allocate 2 cpus", - args: args{ - cpusetQuantity: resource.NewQuantity(0, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: 
[]util.ProcessorInfo{ - {CPUID: 6, CoreID: 3, SocketID: 0, NodeID: 0}, - {CPUID: 7, CoreID: 3, SocketID: 0, NodeID: 0}, - {CPUID: 8, CoreID: 4, SocketID: 0, NodeID: 0}, - {CPUID: 9, CoreID: 4, SocketID: 0, NodeID: 0}, - {CPUID: 10, CoreID: 5, SocketID: 0, NodeID: 0}, - {CPUID: 11, CoreID: 5, SocketID: 0, NodeID: 0}, - }, - }, - oldCPUSetNum: 2, - }, - want: []int32{11, 10}, - }, { name: "allocate cpus with scattering on numa nodes and stacking on HTs 0.", args: args{ - cpusetQuantity: resource.NewQuantity(3, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: []util.ProcessorInfo{ - {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 1, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 2, CoreID: 2, SocketID: 1, NodeID: 1}, - {CPUID: 3, CoreID: 3, SocketID: 1, NodeID: 1}, - {CPUID: 4, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 5, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 6, CoreID: 2, SocketID: 1, NodeID: 1}, - {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 1}, - }, + cpus: 3, + processorInfo: []util.ProcessorInfo{ + {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 1, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 2, CoreID: 2, SocketID: 1, NodeID: 1}, + {CPUID: 3, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 4, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 5, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 6, CoreID: 2, SocketID: 1, NodeID: 1}, + {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 1}, }, + oldCPUSetNum: 3, }, - want: []int32{7, 3, 5}, + want: []int32{0, 4, 2}, }, { name: "allocate cpus with scattering on numa nodes and stacking on HTs 1.", args: args{ - cpusetQuantity: resource.NewQuantity(3, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: []util.ProcessorInfo{ - {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 1}, - {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 1}, - {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 1}, - {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 1}, - }, + cpus: 3, + processorInfo: []util.ProcessorInfo{ + {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 1}, + {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 1}, + {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 1}, + {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 1}, }, + oldCPUSetNum: 5, }, - want: []int32{7, 6, 3}, + want: []int32{0, 1, 4}, }, { name: "allocate cpus with scattering on numa nodes and stacking on HTs 2. 
(also scattering on sockets)", args: args{ - cpusetQuantity: resource.NewQuantity(5, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: []util.ProcessorInfo{ - {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, - {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, - {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, - }, + cpus: 5, + processorInfo: []util.ProcessorInfo{ + {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, + {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, + {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, + {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, + {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, + {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, + {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, + {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, + {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, + {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, + {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, + {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, }, + oldCPUSetNum: 8, }, - want: []int32{15, 14, 11, 10, 7}, + want: []int32{0, 1, 4, 5, 8}, }, { name: "allocate cpus with scattering on numa nodes and stacking on HTs 3. 
(regardless of the initial order)", args: args{ - cpusetQuantity: resource.NewQuantity(5, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: []util.ProcessorInfo{ - {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, - {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, - }, + cpus: 5, + processorInfo: []util.ProcessorInfo{ + {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, + {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, + {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, + {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, + {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, + {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, + {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, + {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, + {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, + {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, + {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, + {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, }, + oldCPUSetNum: 8, }, - want: []int32{15, 14, 11, 10, 7}, - }, - { - name: "allocate cpus for slow scale up:increase cpunum > maxIncreaseCPUNum", - args: args{ - cpusetQuantity: resource.NewQuantity(5, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: []util.ProcessorInfo{ - {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, - {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, - }, - }, - oldCPUSetNum: 1, // maxNewCPUSet := oldCPUSetNum + beMaxIncreaseCPUPercent*totalCPUNum = 3 - }, - want: []int32{15, 14, 11}, + want: []int32{0, 1, 4, 5, 8}, }, { name: "allocate cpus for slow scale up:increase cpunum == maxIncreaseCPUNum", args: args{ - cpusetQuantity: resource.NewQuantity(5, resource.DecimalSI), - nodeCPUInfo: &metriccache.NodeCPUInfo{ - ProcessorInfos: []util.ProcessorInfo{ - {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, - {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, - {CPUID: 3, CoreID: 1, 
SocketID: 0, NodeID: 0}, - {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, - {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, - {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, - {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, - {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, - {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, - {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, - }, + cpus: 5, + processorInfo: []util.ProcessorInfo{ + {CPUID: 12, CoreID: 6, SocketID: 3, NodeID: 1}, + {CPUID: 13, CoreID: 6, SocketID: 3, NodeID: 1}, + {CPUID: 14, CoreID: 7, SocketID: 3, NodeID: 1}, + {CPUID: 2, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 3, CoreID: 1, SocketID: 0, NodeID: 0}, + {CPUID: 11, CoreID: 5, SocketID: 2, NodeID: 1}, + {CPUID: 4, CoreID: 2, SocketID: 1, NodeID: 0}, + {CPUID: 5, CoreID: 2, SocketID: 1, NodeID: 0}, + {CPUID: 8, CoreID: 4, SocketID: 2, NodeID: 1}, + {CPUID: 15, CoreID: 7, SocketID: 3, NodeID: 1}, + {CPUID: 0, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 1, CoreID: 0, SocketID: 0, NodeID: 0}, + {CPUID: 9, CoreID: 4, SocketID: 2, NodeID: 1}, + {CPUID: 10, CoreID: 5, SocketID: 2, NodeID: 1}, + {CPUID: 6, CoreID: 3, SocketID: 1, NodeID: 0}, + {CPUID: 7, CoreID: 3, SocketID: 1, NodeID: 0}, }, + oldCPUSetNum: 3, }, - want: []int32{15, 14, 11, 10, 7}, + want: []int32{0, 1, 4, 5, 8}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := calculateBESuppressCPUSetPolicy(tt.args.cpusetQuantity, tt.args.oldCPUSetNum, tt.args.nodeCPUInfo) + got := calculateBESuppressCPUSetPolicy(tt.args.cpus, tt.args.processorInfo) assert.Equal(t, tt.want, got) }) } @@ -1256,7 +1228,7 @@ func Test_adjustByCPUSet(t *testing.T) { }, oldCPUSets: "7,6,3,2", }, - wantCPUSet: "7,6,3", + wantCPUSet: "2,3,4", }, { name: "test scale up by cpuset.", @@ -1276,9 +1248,18 @@ func Test_adjustByCPUSet(t *testing.T) { }, oldCPUSets: "7,6", }, - wantCPUSet: "7,6,3", + wantCPUSet: "2,3,4", }, } + ctrl := gomock.NewController(t) + mockStatesInformer := mockstatesinformer.NewMockStatesInformer(ctrl) + lsrPod := mockLSRPod() + lsePod := mockLSEPod() + mockStatesInformer.EXPECT().GetAllPods().Return([]*statesinformer.PodMeta{{Pod: lsrPod}, {Pod: lsePod}}).AnyTimes() + r := &resmanager{ + statesInformer: mockStatesInformer, + } + cpuSuppress := NewCPUSuppress(r) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // prepare testing files @@ -1286,7 +1267,7 @@ func Test_adjustByCPUSet(t *testing.T) { podDirs := []string{"pod1", "pod2", "pod3"} testingPrepareBECgroupData(helper, podDirs, tt.args.oldCPUSets) - adjustByCPUSet(tt.args.cpusetQuantity, tt.args.nodeCPUInfo) + cpuSuppress.adjustByCPUSet(tt.args.cpusetQuantity, tt.args.nodeCPUInfo) gotCPUSetBECgroup := helper.ReadCgroupFileContents(util.GetKubeQosRelativePath(corev1.PodQOSBestEffort), system.CPUSet) assert.Equal(t, tt.wantCPUSet, gotCPUSetBECgroup, "checkBECPUSet") @@ -1412,3 +1393,87 @@ func getNodeSLOByThreshold(thresholdConfig *slov1alpha1.ResourceThresholdStrateg }, } } + +func mockLSRPod() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-name-lsr", + UID: "test-pod-uid-lsr", + Labels: map[string]string{ + apiext.LabelPodQoS: string(apiext.QoSLSR), + }, + Annotations: map[string]string{ + apiext.AnnotationResourceStatus: "{\"cpuset\": \"0,6\" }", + }, + }, + Spec: corev1.PodSpec{ + 
Containers: []corev1.Container{ + { + Name: "test-container-1", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(500, resource.DecimalSI), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(500, resource.DecimalSI), + }, + }, + }, + { + Name: "test-container-2", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1000, resource.DecimalSI), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1000, resource.DecimalSI), + }, + }, + }, + }, + }, + } +} + +func mockLSEPod() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-name-lsr", + UID: "test-pod-uid-lsr", + Labels: map[string]string{ + apiext.LabelPodQoS: string(apiext.QoSLSE), + }, + Annotations: map[string]string{ + apiext.AnnotationResourceStatus: "{\"cpuset\": \"7\" }", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container-1", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(500, resource.DecimalSI), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(500, resource.DecimalSI), + }, + }, + }, + { + Name: "test-container-2", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1000, resource.DecimalSI), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1000, resource.DecimalSI), + }, + }, + }, + }, + }, + } +} diff --git a/pkg/koordlet/resmanager/memory_evict.go b/pkg/koordlet/resmanager/memory_evict.go index cb077fbb2..7ec628865 100644 --- a/pkg/koordlet/resmanager/memory_evict.go +++ b/pkg/koordlet/resmanager/memory_evict.go @@ -51,11 +51,11 @@ func NewMemoryEvictor(mgr *resmanager) *MemoryEvictor { } func (m *MemoryEvictor) memoryEvict() { - klog.Infof("starting memory evict process") - defer klog.Infof("memory evict process completed") + klog.V(5).Infof("starting memory evict process") + defer klog.V(5).Infof("memory evict process completed") if time.Now().Before(m.lastEvictTime.Add(time.Duration(m.resManager.config.MemoryEvictCoolTimeSeconds) * time.Second)) { - klog.Infof("skip memory evict process, still in evict cooling time") + klog.V(5).Infof("skip memory evict process, still in evict cooling time") return } @@ -110,7 +110,7 @@ func (m *MemoryEvictor) memoryEvict() { nodeMemoryUsage := nodeMetric.MemoryUsed.MemoryWithoutCache.Value() * 100 / memoryCapacity if nodeMemoryUsage < *thresholdPercent { - klog.Infof("skip memory evict, node memory usage(%v) is below threshold(%v)", nodeMemoryUsage, thresholdConfig) + klog.V(5).Infof("skip memory evict, node memory usage(%v) is below threshold(%v)", nodeMemoryUsage, thresholdConfig) return } diff --git a/pkg/koordlet/resmanager/resctrl_reconcile.go b/pkg/koordlet/resmanager/resctrl_reconcile.go index 4b0882e21..e8ef02422 100644 --- a/pkg/koordlet/resmanager/resctrl_reconcile.go +++ b/pkg/koordlet/resmanager/resctrl_reconcile.go @@ -89,17 +89,17 @@ func getPodResctrlGroup(pod *corev1.Pod) string { return UnknownResctrlGroup } -func getResourceQoSForResctrlGroup(strategy *slov1alpha1.ResourceQoSStrategy, group string) *slov1alpha1.ResourceQoS { +func getResourceQOSForResctrlGroup(strategy *slov1alpha1.ResourceQOSStrategy, group string) *slov1alpha1.ResourceQOS { if strategy == nil { return nil } switch group { 
case LSRResctrlGroup: - return strategy.LSR + return strategy.LSRClass case LSResctrlGroup: - return strategy.LS + return strategy.LSClass case BEResctrlGroup: - return strategy.BE + return strategy.BEClass } return nil } @@ -276,15 +276,15 @@ func calculateL3TasksResource(group string, taskIds []int) ResourceUpdater { } func (r *ResctrlReconcile) calculateAndApplyCatL3PolicyForGroup(group string, cbm uint, l3Num int, - resourceQoS *slov1alpha1.ResourceQoS) error { - if resourceQoS == nil || resourceQoS.ResctrlQoS == nil || resourceQoS.ResctrlQoS.CATRangeStartPercent == nil || - resourceQoS.ResctrlQoS.CATRangeEndPercent == nil { + resourceQoS *slov1alpha1.ResourceQOS) error { + if resourceQoS == nil || resourceQoS.ResctrlQOS == nil || resourceQoS.ResctrlQOS.CATRangeStartPercent == nil || + resourceQoS.ResctrlQOS.CATRangeEndPercent == nil { klog.Warningf("skipped, since resourceQoS or startPercent or endPercent is nil for group %v, "+ "resourceQoS %v", resourceQoS, group) return nil } - startPercent, endPercent := *resourceQoS.ResctrlQoS.CATRangeStartPercent, *resourceQoS.ResctrlQoS.CATRangeEndPercent + startPercent, endPercent := *resourceQoS.ResctrlQOS.CATRangeStartPercent, *resourceQoS.ResctrlQOS.CATRangeEndPercent // calculate policy l3MaskValue, err := calculateCatL3MaskValue(cbm, startPercent, endPercent) if err != nil { @@ -307,14 +307,14 @@ func (r *ResctrlReconcile) calculateAndApplyCatL3PolicyForGroup(group string, cb return nil } -func (r *ResctrlReconcile) calculateAndApplyCatMbPolicyForGroup(group string, l3Num int, resourceQoS *slov1alpha1.ResourceQoS) error { - if resourceQoS == nil || resourceQoS.ResctrlQoS == nil { - klog.Warningf("skipped, since resourceQoS or ResctrlQoS is nil for group %v, "+ +func (r *ResctrlReconcile) calculateAndApplyCatMbPolicyForGroup(group string, l3Num int, resourceQoS *slov1alpha1.ResourceQOS) error { + if resourceQoS == nil || resourceQoS.ResctrlQOS == nil { + klog.Warningf("skipped, since resourceQoS or ResctrlQOS is nil for group %v, "+ "resourceQoS %v", resourceQoS, group) return nil } - memBwPercent := calculateMbaPercentForGroup(group, resourceQoS.ResctrlQoS.MBAPercent) + memBwPercent := calculateMbaPercentForGroup(group, resourceQoS.ResctrlQOS.MBAPercent) if memBwPercent == "" { return nil } @@ -348,7 +348,7 @@ func (r *ResctrlReconcile) calculateAndApplyCatL3GroupTasks(group string, taskId return nil } -func (r *ResctrlReconcile) reconcileCatResctrlPolicy(qosStrategy *slov1alpha1.ResourceQoSStrategy) { +func (r *ResctrlReconcile) reconcileCatResctrlPolicy(qosStrategy *slov1alpha1.ResourceQOSStrategy) { // 1. 
retrieve rdt configs from nodeSLOSpec // 2.1 get cbm and l3 numbers, which are general for all resctrl groups // 2.2 calculate applying resctrl policies, like cat policy and so on, with each rdt config @@ -385,7 +385,7 @@ func (r *ResctrlReconcile) reconcileCatResctrlPolicy(qosStrategy *slov1alpha1.Re // calculate and apply l3 cat policy for each group for _, group := range resctrlGroupList { - resQoSStrategy := getResourceQoSForResctrlGroup(qosStrategy, group) + resQoSStrategy := getResourceQOSForResctrlGroup(qosStrategy, group) err = r.calculateAndApplyCatL3PolicyForGroup(group, cbm, l3Num, resQoSStrategy) if err != nil { klog.Warningf("failed to apply l3 cat policy for group %v, err: %v", group, err) @@ -397,7 +397,7 @@ func (r *ResctrlReconcile) reconcileCatResctrlPolicy(qosStrategy *slov1alpha1.Re } } -func (r *ResctrlReconcile) reconcileResctrlGroups(qosStrategy *slov1alpha1.ResourceQoSStrategy) { +func (r *ResctrlReconcile) reconcileResctrlGroups(qosStrategy *slov1alpha1.ResourceQOSStrategy) { // 1. retrieve task ids for each slo by reading cgroup task file of every pod container // 2. add the related task ids in resctrl groups @@ -425,7 +425,7 @@ func (r *ResctrlReconcile) reconcileResctrlGroups(qosStrategy *slov1alpha1.Resou // only extension-QoS-specified pod are considered podQoSCfg := getPodResourceQoSByQoSClass(pod, qosStrategy, r.resManager.config) - if podQoSCfg.ResctrlQoS.Enable == nil || !(*podQoSCfg.ResctrlQoS.Enable) { + if podQoSCfg.ResctrlQOS.Enable == nil || !(*podQoSCfg.ResctrlQOS.Enable) { klog.V(5).Infof("pod %v with qos %v disabled resctrl", util.GetPodKey(pod), extension.GetPodQoSClass(pod)) continue } @@ -457,10 +457,10 @@ func (r *ResctrlReconcile) reconcile() { return } nodeSLO := r.resManager.getNodeSLOCopy() - if nodeSLO == nil || nodeSLO.Spec.ResourceQoSStrategy == nil { + if nodeSLO == nil || nodeSLO.Spec.ResourceQOSStrategy == nil { // do nothing if nodeSLO == nil || nodeSLO.spec.ResourceStrategy == nil - klog.Warningf("nodeSLO is nil %v, or nodeSLO.Spec.ResourceQoSStrategy is nil %v", - nodeSLO == nil, nodeSLO.Spec.ResourceQoSStrategy == nil) + klog.Warningf("nodeSLO is nil %v, or nodeSLO.Spec.ResourceQOSStrategy is nil %v", + nodeSLO == nil, nodeSLO.Spec.ResourceQOSStrategy == nil) return } @@ -477,6 +477,6 @@ func (r *ResctrlReconcile) reconcile() { klog.Warningf("ResctrlReconcile failed, cannot initialize cat resctrl group, err: %s", err) return } - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) - r.reconcileResctrlGroups(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) + r.reconcileResctrlGroups(nodeSLO.Spec.ResourceQOSStrategy) } diff --git a/pkg/koordlet/resmanager/resctrl_reconcile_test.go b/pkg/koordlet/resmanager/resctrl_reconcile_test.go index 67c5132b4..3e8e72497 100644 --- a/pkg/koordlet/resmanager/resctrl_reconcile_test.go +++ b/pkg/koordlet/resmanager/resctrl_reconcile_test.go @@ -420,7 +420,7 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group string cbm uint l3Num int - qosStrategy *slov1alpha1.ResourceQoSStrategy + qosStrategy *slov1alpha1.ResourceQOSStrategy } type field struct { invalidPath bool @@ -444,10 +444,10 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: LSResctrlGroup, cbm: 0xf, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: 
&slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, @@ -465,10 +465,10 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: LSResctrlGroup, cbm: 0xf, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, @@ -485,10 +485,10 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: LSResctrlGroup, cbm: 0x4, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, @@ -505,10 +505,10 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: LSResctrlGroup, cbm: 0xf, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, @@ -525,18 +525,18 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: LSResctrlGroup, cbm: 0x7ff, l3Num: 1, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(10), CATRangeEndPercent: pointer.Int64Ptr(50), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, @@ -553,18 +553,18 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: LSRResctrlGroup, cbm: 0x7ff, l3Num: 1, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(10), CATRangeEndPercent: pointer.Int64Ptr(50), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: 
&slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, @@ -581,18 +581,18 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { group: BEResctrlGroup, cbm: 0x7ff, l3Num: 1, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(10), CATRangeEndPercent: pointer.Int64Ptr(50), }, @@ -640,7 +640,7 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) { // execute function err := r.calculateAndApplyCatL3PolicyForGroup(tt.args.group, tt.args.cbm, tt.args.l3Num, - getResourceQoSForResctrlGroup(tt.args.qosStrategy, tt.args.group)) + getResourceQOSForResctrlGroup(tt.args.qosStrategy, tt.args.group)) assert.Equal(t, tt.wantErr, err != nil) schemataPath := filepath.Join(validSysFSRootDir, system.ResctrlDir, tt.args.group, system.SchemataFileName) @@ -654,7 +654,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) { type args struct { group string l3Num int - qosStrategy *slov1alpha1.ResourceQoSStrategy + qosStrategy *slov1alpha1.ResourceQOSStrategy } type field struct { invalidPath bool @@ -677,10 +677,10 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) { args: args{ group: LSResctrlGroup, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ MBAPercent: pointer.Int64Ptr(90), }, }, @@ -696,10 +696,10 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) { args: args{ group: LSResctrlGroup, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ MBAPercent: pointer.Int64Ptr(90), }, }, @@ -714,10 +714,10 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) { args: args{ group: LSResctrlGroup, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ MBAPercent: pointer.Int64Ptr(90), }, }, @@ -732,17 +732,17 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) { args: args{ group: BEResctrlGroup, l3Num: 2, - qosStrategy: &slov1alpha1.ResourceQoSStrategy{ - LS: 
&slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + qosStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ MBAPercent: pointer.Int64Ptr(100), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ MBAPercent: pointer.Int64Ptr(90), }, }, @@ -789,7 +789,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) { // execute function err := r.calculateAndApplyCatMbPolicyForGroup(tt.args.group, tt.args.l3Num, - getResourceQoSForResctrlGroup(tt.args.qosStrategy, tt.args.group)) + getResourceQOSForResctrlGroup(tt.args.qosStrategy, tt.args.group)) assert.Equal(t, tt.wantErr, err != nil) schemataPath := filepath.Join(validSysFSRootDir, system.ResctrlDir, tt.args.group, system.SchemataFileName) @@ -901,27 +901,27 @@ func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) { nodeSLO := &slov1alpha1.NodeSLO{ Spec: slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), MBAPercent: pointer.Int64Ptr(90), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(30), }, @@ -949,7 +949,7 @@ func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) { defer func() { stop <- struct{}{} }() // reconcile and check if the result is correct - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) beSchemataPath := filepath.Join(resctrlDirPath, BEResctrlGroup, system.SchemataFileName) expectBESchemataStr := "L3:0=f;1=f;\n" @@ -964,11 +964,11 @@ func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) { // log error for invalid be resctrl path err = os.RemoveAll(filepath.Join(resctrlDirPath, BEResctrlGroup)) assert.NoError(t, err) - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) // log error for invalid root resctrl path system.Conf.SysFSRootDir = "invalidPath" - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) system.Conf.SysFSRootDir = validSysFSRootDir // log error for invalid l3 number @@ -976,27 +976,27 @@ func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) { BasicInfo: util.CPUBasicInfo{CatL3CbmMask: "7ff"}, 
TotalInfo: util.CPUTotalInfo{NumberL3s: -1}, }, nil).Times(1) - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) // log error for invalid l3 cbm metricCache.EXPECT().GetNodeCPUInfo(&metriccache.QueryParam{}).Return(&metriccache.NodeCPUInfo{ BasicInfo: util.CPUBasicInfo{CatL3CbmMask: "invalid"}, TotalInfo: util.CPUTotalInfo{NumberL3s: 2}, }, nil).Times(1) - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) metricCache.EXPECT().GetNodeCPUInfo(&metriccache.QueryParam{}).Return(&metriccache.NodeCPUInfo{ BasicInfo: util.CPUBasicInfo{CatL3CbmMask: ""}, TotalInfo: util.CPUTotalInfo{NumberL3s: 2}, }, nil).Times(1) - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) // log error for invalid nodeCPUInfo metricCache.EXPECT().GetNodeCPUInfo(&metriccache.QueryParam{}).Return(nil, nil) - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) // log error for get nodeCPUInfo failed metricCache.EXPECT().GetNodeCPUInfo(&metriccache.QueryParam{}).Return(nil, fmt.Errorf("error")) - r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQoSStrategy) + r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy) }) } @@ -1034,8 +1034,8 @@ func TestResctrlReconcile_reconcileResctrlGroups(t *testing.T) { }, CgroupDir: "p0", } - testQOSStrategy := util.DefaultResourceQoSStrategy() - testQOSStrategy.BE.ResctrlQoS.Enable = pointer.BoolPtr(true) + testQOSStrategy := util.DefaultResourceQOSStrategy() + testQOSStrategy.BEClass.ResctrlQOS.Enable = pointer.BoolPtr(true) t.Run("test", func(t *testing.T) { // initialization @@ -1095,26 +1095,26 @@ func TestResctrlReconcile_reconcile(t *testing.T) { testingNodeSLO := &slov1alpha1.NodeSLO{ Spec: slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(30), }, @@ -1206,7 +1206,7 @@ func TestResctrlReconcile_reconcile(t *testing.T) { r.reconcile() // test strategy parse error - testingNodeSLO.Spec.ResourceQoSStrategy = nil + testingNodeSLO.Spec.ResourceQOSStrategy = nil statesInformer.EXPECT().GetNodeSLO().Return(testingNodeSLO).AnyTimes() r.reconcile() diff --git a/pkg/koordlet/resmanager/resmanager.go b/pkg/koordlet/resmanager/resmanager.go index b76b48b1e..84dc3910f 100644 --- a/pkg/koordlet/resmanager/resmanager.go +++ b/pkg/koordlet/resmanager/resmanager.go @@ 
-75,7 +75,7 @@ func NewResManager(cfg *Config, schema *apiruntime.Scheme, kubeClient clientset. eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - recorder := eventBroadcaster.NewRecorder(schema, corev1.EventSource{Component: "slo-agent-reporter", Host: nodeName}) + recorder := eventBroadcaster.NewRecorder(schema, corev1.EventSource{Component: "koordlet-resmanager", Host: nodeName}) r := &resmanager{ config: cfg, diff --git a/pkg/koordlet/runtimehooks/config.go b/pkg/koordlet/runtimehooks/config.go index 7ec9e10e5..4e595835f 100644 --- a/pkg/koordlet/runtimehooks/config.go +++ b/pkg/koordlet/runtimehooks/config.go @@ -24,11 +24,13 @@ import ( cliflag "k8s.io/component-base/cli/flag" "k8s.io/component-base/featuregate" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/cpuset" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/groupidentity" ) const ( - GroupIdentity featuregate.Feature = "GroupIdentity" + GroupIdentity featuregate.Feature = "GroupIdentity" + CPUSetAllocator featuregate.Feature = "CPUSetAllocator" ) var ( @@ -36,11 +38,13 @@ var ( DefaultRuntimeHooksFG featuregate.FeatureGate = DefaultMutableRuntimeHooksFG defaultRuntimeHooksFG = map[featuregate.Feature]featuregate.FeatureSpec{ - GroupIdentity: {Default: false, PreRelease: featuregate.Alpha}, + GroupIdentity: {Default: false, PreRelease: featuregate.Alpha}, + CPUSetAllocator: {Default: false, PreRelease: featuregate.Alpha}, } runtimeHookPlugins = map[featuregate.Feature]HookPlugin{ - GroupIdentity: groupidentity.Object(), + GroupIdentity: groupidentity.Object(), + CPUSetAllocator: cpuset.Object(), } ) @@ -59,8 +63,8 @@ func NewDefaultConfig() *Config { } func (c *Config) InitFlags(fs *flag.FlagSet) { - fs.StringVar(&c.RuntimeHooksNetwork, "RuntimeHooksNetwork", c.RuntimeHooksNetwork, "rpc server network type for runtime hooks") - fs.StringVar(&c.RuntimeHooksAddr, "RuntimeHooksAddr", c.RuntimeHooksAddr, "rpc server address for runtime hooks") + fs.StringVar(&c.RuntimeHooksNetwork, "runtime-hooks-network", c.RuntimeHooksNetwork, "rpc server network type for runtime hooks") + fs.StringVar(&c.RuntimeHooksAddr, "runtime-hooks-addr", c.RuntimeHooksAddr, "rpc server address for runtime hooks") fs.Var(cliflag.NewMapStringBool(&c.FeatureGates), "runtime-hooks", "A set of key=value pairs that describe feature gates for runtime hooks alpha/experimental features. "+ "Options are:\n"+strings.Join(DefaultRuntimeHooksFG.KnownFeatures(), "\n")) diff --git a/pkg/koordlet/runtimehooks/config_test.go b/pkg/koordlet/runtimehooks/config_test.go new file mode 100644 index 000000000..ec970dea7 --- /dev/null +++ b/pkg/koordlet/runtimehooks/config_test.go @@ -0,0 +1,40 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package runtimehooks + +import ( + "flag" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_NewDefaultConfig(t *testing.T) { + expectConfig := &Config{ + RuntimeHooksNetwork: "tcp", + RuntimeHooksAddr: ":9318", + FeatureGates: map[string]bool{}, + } + defaultConfig := NewDefaultConfig() + assert.Equal(t, expectConfig, defaultConfig) +} + +func Test_InitFlags(t *testing.T) { + cfg := NewDefaultConfig() + cfg.InitFlags(flag.CommandLine) + flag.Parse() +} diff --git a/pkg/koordlet/runtimehooks/hooks/cpuset/cpuset.go b/pkg/koordlet/runtimehooks/hooks/cpuset/cpuset.go new file mode 100644 index 000000000..2ce7bac6f --- /dev/null +++ b/pkg/koordlet/runtimehooks/hooks/cpuset/cpuset.go @@ -0,0 +1,102 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cpuset + +import ( + "fmt" + "sync" + + "k8s.io/klog/v2" + "k8s.io/utils/pointer" + + ext "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/reconciler" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/rule" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" + rmconfig "github.com/koordinator-sh/koordinator/pkg/runtimeproxy/config" + sysutil "github.com/koordinator-sh/koordinator/pkg/util/system" +) + +const ( + name = "CPUSetAllocator" + description = "set cpuset value by pod allocation" +) + +type cpusetPlugin struct { + rule *cpusetRule + ruleRWMutex sync.RWMutex +} + +func (p *cpusetPlugin) Register() { + klog.V(5).Infof("register hook %v", name) + hooks.Register(rmconfig.PreStartContainer, name, description, p.SetContainerCPUSet) + rule.Register(name, description, + rule.WithParseFunc(statesinformer.RegisterTypeNodeTopology, p.parseRule), + rule.WithUpdateCallback(p.ruleUpdateCb)) + reconciler.RegisterCgroupReconciler(reconciler.ContainerLevel, sysutil.CPUSet, p.SetContainerCPUSet, + "set container cpuset") +} + +var singleton *cpusetPlugin + +func Object() *cpusetPlugin { + if singleton == nil { + singleton = &cpusetPlugin{} + } + return singleton +} + +func (p *cpusetPlugin) SetContainerCPUSet(proto protocol.HooksProtocol) error { + containerCtx := proto.(*protocol.ContainerContext) + if containerCtx == nil { + return fmt.Errorf("container protocol is nil for plugin %v", name) + } + containerReq := containerCtx.Request + + // cpuset from pod annotation + if cpusetVal, err := getCPUSetFromPod(containerReq.PodAnnotations); err != nil { + return err + } else if cpusetVal != "" { + containerCtx.Response.Resources.CPUSet = pointer.StringPtr(cpusetVal) + return nil + } + + // use cpushare pool for pod + r := p.getRule() + if r == nil { + klog.V(5).Infof("hook plugin rule is nil, nothing to do for plugin %v", name) + return nil + } + cpusetValue, err := r.getContainerCPUSet(&containerReq) + if err != nil { + return err + } + if 
cpusetValue != "" { + containerCtx.Response.Resources.CPUSet = pointer.StringPtr(cpusetValue) + } + return nil +} + +func getCPUSetFromPod(podAnnotations map[string]string) (string, error) { + podAlloc, err := ext.GetResourceStatus(podAnnotations) + if err != nil { + return "", err + } + return podAlloc.CPUSet, nil +} diff --git a/pkg/koordlet/runtimehooks/hooks/cpuset/cpuset_test.go b/pkg/koordlet/runtimehooks/hooks/cpuset/cpuset_test.go new file mode 100644 index 000000000..a691d61ef --- /dev/null +++ b/pkg/koordlet/runtimehooks/hooks/cpuset/cpuset_test.go @@ -0,0 +1,267 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cpuset + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/utils/pointer" + + ext "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol" + "github.com/koordinator-sh/koordinator/pkg/util" + "github.com/koordinator-sh/koordinator/pkg/util/system" +) + +func initCPUSet(dirWithKube string, value string, helper *system.FileTestUtil) { + helper.WriteCgroupFileContents(dirWithKube, system.CPUSet, value) +} + +func getCPUSet(dirWithKube string, helper *system.FileTestUtil) string { + return helper.ReadCgroupFileContents(dirWithKube, system.CPUSet) +} + +func Test_cpusetPlugin_SetContainerCPUSet(t *testing.T) { + type fields struct { + rule *cpusetRule + } + type args struct { + podAlloc *ext.ResourceStatus + proto protocol.HooksProtocol + } + tests := []struct { + name string + fields fields + args args + wantErr bool + wantCPUSet *string + }{ + { + name: "set cpu with nil protocol", + fields: fields{ + rule: nil, + }, + args: args{ + proto: nil, + }, + wantErr: true, + wantCPUSet: nil, + }, + { + name: "set cpu by bad pod allocated format", + fields: fields{ + rule: nil, + }, + args: args{ + proto: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + CgroupParent: "kubepods/test-pod/test-container/", + PodAnnotations: map[string]string{ + ext.AnnotationResourceStatus: "bad-format", + }, + }, + }, + }, + wantErr: true, + wantCPUSet: nil, + }, + { + name: "set cpu by pod allocated", + fields: fields{ + rule: nil, + }, + args: args{ + podAlloc: &ext.ResourceStatus{ + CPUSet: "2-4", + }, + proto: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + CgroupParent: "kubepods/test-pod/test-container/", + }, + }, + }, + wantErr: false, + wantCPUSet: pointer.StringPtr("2-4"), + }, + { + name: "set cpu by pod allocated share pool with nil rule", + fields: fields{ + rule: nil, + }, + args: args{ + podAlloc: &ext.ResourceStatus{ + CPUSharedPools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + }, + }, + }, + proto: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + CgroupParent: "kubepods/test-pod/test-container/", + }, + }, + }, + wantErr: false, + wantCPUSet: nil, + }, + { + name: "set cpu by pod allocated share pool", + fields: fields{ + rule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 
0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + }, + args: args{ + podAlloc: &ext.ResourceStatus{ + CPUSharedPools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + }, + }, + }, + proto: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + CgroupParent: "kubepods/test-pod/test-container/", + }, + }, + }, + wantErr: false, + wantCPUSet: pointer.StringPtr("0-7"), + }, + { + name: "set cpu for origin besteffort pod", + fields: fields{ + rule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + }, + args: args{ + proto: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + CgroupParent: "kubepods/besteffort/test-pod/test-container/", + }, + }, + }, + wantErr: false, + wantCPUSet: nil, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testHelper := system.NewFileTestUtil(t) + var containerCtx *protocol.ContainerContext + + p := &cpusetPlugin{ + rule: tt.fields.rule, + } + if tt.args.proto != nil { + containerCtx = tt.args.proto.(*protocol.ContainerContext) + initCPUSet(containerCtx.Request.CgroupParent, "", testHelper) + if tt.args.podAlloc != nil { + podAllocJson := util.DumpJSON(tt.args.podAlloc) + containerCtx.Request.PodAnnotations = map[string]string{ + ext.AnnotationResourceStatus: podAllocJson, + } + } + } + + err := p.SetContainerCPUSet(containerCtx) + if (err != nil) != tt.wantErr { + t.Errorf("SetContainerCPUSet() error = %v, wantErr %v", err, tt.wantErr) + } + + if containerCtx == nil { + return + } + if tt.wantCPUSet == nil { + assert.Nil(t, containerCtx.Response.Resources.CPUSet, "cpuset value should be nil") + } else { + containerCtx.ReconcilerDone() + assert.Equal(t, *tt.wantCPUSet, *containerCtx.Response.Resources.CPUSet, "container cpuset should be equal") + gotCPUSet := getCPUSet(containerCtx.Request.CgroupParent, testHelper) + assert.Equal(t, *tt.wantCPUSet, gotCPUSet, "container cpuset should be equal") + } + }) + } +} + +func Test_getCPUSetFromPod(t *testing.T) { + type args struct { + podAnnotations map[string]string + podAlloc *ext.ResourceStatus + } + tests := []struct { + name string + args args + want string + wantErr bool + }{ + { + name: "get cpuset from annotation", + args: args{ + podAnnotations: map[string]string{}, + podAlloc: &ext.ResourceStatus{ + CPUSet: "2-4", + }, + }, + want: "2-4", + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.args.podAlloc != nil { + podAllocJson := util.DumpJSON(tt.args.podAlloc) + tt.args.podAnnotations[ext.AnnotationResourceStatus] = podAllocJson + } + got, err := getCPUSetFromPod(tt.args.podAnnotations) + if (err != nil) != tt.wantErr { + t.Errorf("getCPUSetFromPod() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("getCPUSetFromPod() got = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/koordlet/runtimehooks/hooks/cpuset/rule.go b/pkg/koordlet/runtimehooks/hooks/cpuset/rule.go new file mode 100644 index 000000000..f5357834a --- /dev/null +++ b/pkg/koordlet/runtimehooks/hooks/cpuset/rule.go @@ -0,0 +1,127 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cpuset + +import ( + "fmt" + "reflect" + "strings" + + topov1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1" + "github.com/koordinator-sh/koordinator/pkg/util" + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + + ext "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" +) + +type cpusetRule struct { + sharePools []ext.CPUSharedPool +} + +func (r *cpusetRule) getContainerCPUSet(containerReq *protocol.ContainerRequest) (string, error) { + if containerReq == nil { + return "", nil + } + podAnnotations := containerReq.PodAnnotations + podLabels := containerReq.PodLabels + podAlloc, err := ext.GetResourceStatus(podAnnotations) + if err != nil { + return "", err + } + + if len(podAlloc.CPUSharedPools) != 0 { + // pod specified cpu share pool + cpusetList := make([]string, 0, len(podAlloc.CPUSharedPools)) + for _, specifiedSharePool := range podAlloc.CPUSharedPools { + for _, nodeSharePool := range r.sharePools { + if specifiedSharePool.Socket == nodeSharePool.Socket && specifiedSharePool.Node == nodeSharePool.Node { + cpusetList = append(cpusetList, nodeSharePool.CPUSet) + } + } + } + return strings.Join(cpusetList, ","), nil + } + + kubeQOS := util.GetKubeQoSByCgroupParent(containerReq.CgroupParent) + podQOSClass := ext.GetQoSClassByLabels(podLabels) + isKubeOriginLS := podQOSClass == ext.QoSNone && kubeQOS != corev1.PodQOSBestEffort + if podQOSClass == ext.QoSLS || isKubeOriginLS { + // an LS pod which does not specify a cpuset is bound to all cpu share pools + // TODO use dynamic binding policy in the future + allSharePoolCPUs := make([]string, 0, len(r.sharePools)) + for _, nodeSharePool := range r.sharePools { + allSharePoolCPUs = append(allSharePoolCPUs, nodeSharePool.CPUSet) + } + return strings.Join(allSharePoolCPUs, ","), nil + } + return "", nil +} + +func (p *cpusetPlugin) parseRule(nodeTopoIf interface{}) (bool, error) { + nodeTopo, ok := nodeTopoIf.(*topov1alpha1.NodeResourceTopology) + if !ok { + return false, fmt.Errorf("parse format for hook plugin %v failed, expect: %v, got: %T", + name, "*topov1alpha1.NodeResourceTopology", nodeTopoIf) + } + cpuSharePools, err := ext.GetNodeCPUSharePools(nodeTopo.Annotations) + if err != nil { + return false, err + } + newRule := &cpusetRule{ + sharePools: cpuSharePools, + } + updated := p.updateRule(newRule) + return updated, nil +} + +func (p *cpusetPlugin) ruleUpdateCb(pods []*statesinformer.PodMeta) error { + for _, podMeta := range pods { + for _, containerStat := range podMeta.Pod.Status.ContainerStatuses { + containerCtx := &protocol.ContainerContext{} + containerCtx.FromReconciler(podMeta, containerStat.Name) + if err := p.SetContainerCPUSet(containerCtx); err != nil { + klog.Infof("parse cpuset from pod annotation failed during callback, error: %v", err) + continue + } + containerCtx.ReconcilerDone() + } + } + return nil +} + +func (p *cpusetPlugin) getRule() *cpusetRule { + p.ruleRWMutex.RLock() + defer p.ruleRWMutex.RUnlock() + if p.rule == nil 
{ + return nil + } + rule := *p.rule + return &rule +} + +func (p *cpusetPlugin) updateRule(newRule *cpusetRule) bool { + p.ruleRWMutex.Lock() + defer p.ruleRWMutex.Unlock() + if !reflect.DeepEqual(newRule, p.rule) { + p.rule = newRule + return true + } + return false +} diff --git a/pkg/koordlet/runtimehooks/hooks/cpuset/rule_test.go b/pkg/koordlet/runtimehooks/hooks/cpuset/rule_test.go new file mode 100644 index 000000000..b0e621d26 --- /dev/null +++ b/pkg/koordlet/runtimehooks/hooks/cpuset/rule_test.go @@ -0,0 +1,595 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cpuset + +import ( + "testing" + + topov1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + ext "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" + "github.com/koordinator-sh/koordinator/pkg/util" + "github.com/koordinator-sh/koordinator/pkg/util/system" +) + +func Test_cpusetRule_getContainerCPUSet(t *testing.T) { + type fields struct { + sharePools []ext.CPUSharedPool + } + type args struct { + podAlloc *ext.ResourceStatus + containerReq *protocol.ContainerRequest + } + tests := []struct { + name string + fields fields + args args + want string + wantErr bool + }{ + { + name: "get cpuset from bad annotation", + fields: fields{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + }, + }, + args: args{ + containerReq: &protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{}, + ContainerMeta: protocol.ContainerMeta{}, + PodLabels: map[string]string{}, + PodAnnotations: map[string]string{ + ext.AnnotationResourceStatus: "bad-alloc-fmt", + }, + CgroupParent: "burstable/test-pod/test-container", + }, + }, + want: "", + wantErr: true, + }, + { + name: "get cpuset from annotation share pool", + fields: fields{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + args: args{ + containerReq: &protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{}, + ContainerMeta: protocol.ContainerMeta{}, + PodLabels: map[string]string{}, + PodAnnotations: map[string]string{}, + CgroupParent: "burstable/test-pod/test-container", + }, + podAlloc: &ext.ResourceStatus{ + CPUSharedPools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + }, + }, + }, + }, + want: "0-7", + wantErr: false, + }, + { + name: "get all share pools for ls pod", + fields: fields{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + args: args{ + containerReq: &protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{}, + ContainerMeta: protocol.ContainerMeta{}, + PodLabels: map[string]string{ + 
ext.LabelPodQoS: string(ext.QoSLS), + }, + PodAnnotations: map[string]string{}, + CgroupParent: "burstable/test-pod/test-container", + }, + }, + want: "0-7,8-15", + wantErr: false, + }, + { + name: "get all share pools for origin burstable pod", + fields: fields{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + args: args{ + containerReq: &protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{}, + ContainerMeta: protocol.ContainerMeta{}, + PodLabels: map[string]string{}, + PodAnnotations: map[string]string{}, + CgroupParent: "burstable/test-pod/test-container", + }, + }, + want: "0-7,8-15", + wantErr: false, + }, + { + name: "nothing for origin besteffort pod", + fields: fields{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + args: args{ + containerReq: &protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{}, + ContainerMeta: protocol.ContainerMeta{}, + PodLabels: map[string]string{}, + PodAnnotations: map[string]string{}, + CgroupParent: "besteffort/test-pod/test-container", + }, + }, + want: "", + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &cpusetRule{ + sharePools: tt.fields.sharePools, + } + if tt.args.podAlloc != nil { + podAllocJson := util.DumpJSON(tt.args.podAlloc) + tt.args.containerReq.PodAnnotations[ext.AnnotationResourceStatus] = podAllocJson + } + got, err := r.getContainerCPUSet(tt.args.containerReq) + if (err != nil) != tt.wantErr { + t.Errorf("getCPUSet() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("getCPUSet() got = %v, wantUpdated %v", got, tt.want) + } + }) + } + // node.koordinator.sh/cpu-shared-pools: '[{"cpuset":"2-7"}]' + // scheduling.koordinator.sh/resource-status: '{"cpuset":"0-1"}' +} + +func Test_cpusetPlugin_parseRuleBadIf(t *testing.T) { + type fields struct { + rule *cpusetRule + } + type args struct { + nodeTopo interface{} + } + tests := []struct { + name string + fields fields + args args + wantUpdated bool + wantRule *cpusetRule + wantErr bool + }{ + { + name: "update rule with bad format", + fields: fields{ + rule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + }, + args: args{ + nodeTopo: corev1.Pod{}, + }, + wantUpdated: false, + wantRule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := &cpusetPlugin{ + rule: tt.fields.rule, + } + got, err := p.parseRule(tt.args.nodeTopo) + if (err != nil) != tt.wantErr { + t.Errorf("parseRule() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.wantUpdated { + t.Errorf("parseRule() got = %v, wantUpdated %v", got, tt.wantUpdated) + } + assert.Equal(t, tt.wantRule, p.rule, "after plugin rule parse") + }) + } +} + +func Test_cpusetPlugin_parseRule(t *testing.T) { + type fields struct { + rule *cpusetRule + } + type args struct { + nodeTopo *topov1alpha1.NodeResourceTopology + sharePools []ext.CPUSharedPool + } + tests := []struct { + name string + fields fields + args args + wantUpdated bool + wantRule *cpusetRule + wantErr bool + }{ + { + name: "update rule with bad format", + fields: 
fields{ + rule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + }, + args: args{ + nodeTopo: &topov1alpha1.NodeResourceTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Annotations: map[string]string{ + ext.AnnotationNodeCPUSharedPools: "bad-fmt", + }, + }, + }, + }, + wantUpdated: false, + wantRule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + wantErr: true, + }, + { + name: "update rule with same", + fields: fields{ + rule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + }, + args: args{ + nodeTopo: &topov1alpha1.NodeResourceTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + }, + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + wantUpdated: false, + wantRule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + wantErr: false, + }, + { + name: "update rule success", + fields: fields{ + rule: nil, + }, + args: args{ + nodeTopo: &topov1alpha1.NodeResourceTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + }, + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + wantUpdated: true, + wantRule: &cpusetRule{ + sharePools: []ext.CPUSharedPool{ + { + Socket: 0, + Node: 0, + CPUSet: "0-7", + }, + { + Socket: 1, + Node: 0, + CPUSet: "8-15", + }, + }, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := &cpusetPlugin{ + rule: tt.fields.rule, + } + if len(tt.args.sharePools) != 0 { + sharePoolJson := util.DumpJSON(tt.args.sharePools) + tt.args.nodeTopo.Annotations = map[string]string{ + ext.AnnotationNodeCPUSharedPools: sharePoolJson, + } + } + got, err := p.parseRule(tt.args.nodeTopo) + if (err != nil) != tt.wantErr { + t.Errorf("parseRule() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.wantUpdated { + t.Errorf("parseRule() got = %v, wantUpdated %v", got, tt.wantUpdated) + } + assert.Equal(t, tt.wantRule, p.rule, "after plugin rule parse") + }) + } +} + +func Test_cpusetPlugin_ruleUpdateCb(t *testing.T) { + type args struct { + pods []*statesinformer.PodMeta + podAllocs map[string]ext.ResourceStatus + } + type wants struct { + containersCPUSet map[string]string + } + tests := []struct { + name string + args args + wants wants + wantErr bool + }{ + { + name: "set container cpuset", + args: args{ + pods: []*statesinformer.PodMeta{ + { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "pod-with-cpuset-alloc-uid", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "container-with-cpuset-alloc-name", + }, + }, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "container-with-cpuset-alloc-name", + ContainerID: "containerd://container-with-cpuset-alloc-uid", + }, + }, + }, + }, + }, + { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "pod-with-bad-cpuset-alloc-uid", + Annotations: map[string]string{ + ext.AnnotationResourceStatus: "bad-format", + }, + }, + 
Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "container-with-bad-cpuset-alloc-name", + }, + }, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "container-with-bad-cpuset-alloc-name", + ContainerID: "containerd://container-with-bad-cpuset-alloc-uid", + }, + }, + }, + }, + }, + }, + podAllocs: map[string]ext.ResourceStatus{ + "pod-with-cpuset-alloc-uid": { + CPUSet: "2-4", + }, + }, + }, + wants: wants{ + containersCPUSet: map[string]string{ + "container-with-cpuset-alloc-name": "2-4", + "container-with-bad-cpuset-alloc-name": "", + }, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testHelper := system.NewFileTestUtil(t) + // init cgroups cpuset file + for _, podMeta := range tt.args.pods { + for _, containerStat := range podMeta.Pod.Status.ContainerStatuses { + containerPath, err := util.GetContainerCgroupPathWithKubeByID(podMeta.CgroupDir, containerStat.ContainerID) + assert.NoError(t, err, "get container cgroup path during init container cpuset") + initCPUSet(containerPath, "", testHelper) + } + } + + // init pod annotations + for _, podMeta := range tt.args.pods { + podUID := string(podMeta.Pod.UID) + podAlloc, exist := tt.args.podAllocs[podUID] + if !exist { + continue + } + podAllocJson := util.DumpJSON(podAlloc) + podMeta.Pod.Annotations = map[string]string{ + ext.AnnotationResourceStatus: podAllocJson, + } + } + + p := &cpusetPlugin{} + if err := p.ruleUpdateCb(tt.args.pods); (err != nil) != tt.wantErr { + t.Errorf("ruleUpdateCb() error = %v, wantErr %v", err, tt.wantErr) + } + + for _, podMeta := range tt.args.pods { + for _, containerStat := range podMeta.Pod.Status.ContainerStatuses { + containerPath, err := util.GetContainerCgroupPathWithKubeByID(podMeta.CgroupDir, containerStat.ContainerID) + assert.NoError(t, err, "get container cgroup path during check container cpuset") + gotCPUSet := getCPUSet(containerPath, testHelper) + assert.Equal(t, tt.wants.containersCPUSet[containerStat.Name], gotCPUSet, + "cpuset after callback should be equal") + } + } + }) + } +} diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go index 7b937b2c2..64c333f6a 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go @@ -24,7 +24,9 @@ import ( "k8s.io/utils/pointer" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/reconciler" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/rule" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" rmconfig "github.com/koordinator-sh/koordinator/pkg/runtimeproxy/config" "github.com/koordinator-sh/koordinator/pkg/util" sysutil "github.com/koordinator-sh/koordinator/pkg/util/system" @@ -45,13 +47,13 @@ func (b *bvtPlugin) Register() { klog.V(5).Infof("register hook %v", name) hooks.Register(rmconfig.PreRunPodSandbox, name, description, b.SetPodBvtValue) rule.Register(name, description, - rule.WithParseFunc(b.parseRule), + rule.WithParseFunc(statesinformer.RegisterTypeNodeSLOSpec, b.parseRule), rule.WithUpdateCallback(b.ruleUpdateCb), rule.WithSystemSupported(b.SystemSupported)) - //reconciler.RegisterCgroupReconciler(reconciler.PodLevel, sysutil.CPUBVTWarpNs, b.SetPodBvtValue, - // "reconcile pod level cpu bvt value") - //reconciler.RegisterCgroupReconciler(reconciler.KubeQOSLevel, 
sysutil.CPUBVTWarpNs, b.SetKubeQOSBvtValue, - // "reconcile kubeqos level cpu bvt value") + reconciler.RegisterCgroupReconciler(reconciler.PodLevel, sysutil.CPUBVTWarpNs, b.SetPodBvtValue, + "reconcile pod level cpu bvt value") + reconciler.RegisterCgroupReconciler(reconciler.KubeQOSLevel, sysutil.CPUBVTWarpNs, b.SetKubeQOSBvtValue, + "reconcile kubeqos level cpu bvt value") } func (b *bvtPlugin) SystemSupported() bool { diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go index 1b4b13fd8..70edfb96f 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go @@ -31,11 +31,15 @@ func (b *bvtPlugin) SetPodBvtValue(p protocol.HooksProtocol) error { return nil } r := b.getRule() + if r == nil { + klog.V(5).Infof("hook plugin rule is nil, nothing to do for plugin %v", name) + return nil + } podCtx := p.(*protocol.PodContext) req := podCtx.Request - podQoS := ext.GetQoSClassByLabels(req.Labels) - podKubeQoS := util.GetKubeQoSByCgroupParent(req.CgroupParent) - podBvt := r.getPodBvtValue(podQoS, podKubeQoS) + podQOS := ext.GetQoSClassByLabels(req.Labels) + podKubeQOS := util.GetKubeQoSByCgroupParent(req.CgroupParent) + podBvt := r.getPodBvtValue(podQOS, podKubeQOS) podCtx.Response.Resources.CPUBvt = pointer.Int64(podBvt) return nil } @@ -46,9 +50,13 @@ func (b *bvtPlugin) SetKubeQOSBvtValue(p protocol.HooksProtocol) error { return nil } r := b.getRule() + if r == nil { + klog.V(5).Infof("hook plugin rule is nil, nothing to do for plugin %v", name) + return nil + } kubeQOSCtx := p.(*protocol.KubeQOSContext) req := kubeQOSCtx.Request - bvtValue := r.getKubeQoSDirBvtValue(req.KubeQOSClass) + bvtValue := r.getKubeQOSDirBvtValue(req.KubeQOSClass) kubeQOSCtx.Response.Resources.CPUBvt = pointer.Int64(bvtValue) return nil } diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go index 3c430dc29..e3f0aac40 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go @@ -49,6 +49,7 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { }, } type fields struct { + rule *bvtRule systemSupported *bool } type args struct { @@ -67,6 +68,7 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { { name: "set ls pod bvt", fields: fields{ + rule: defaultRule, systemSupported: pointer.Bool(true), }, args: args{ @@ -85,6 +87,7 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { { name: "set be pod bvt", fields: fields{ + rule: defaultRule, systemSupported: pointer.Bool(true), }, args: args{ @@ -103,6 +106,7 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { { name: "set be pod bvt but system not support", fields: fields{ + rule: defaultRule, systemSupported: pointer.Bool(false), }, args: args{ @@ -118,6 +122,25 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { bvtValue: nil, }, }, + { + name: "set be pod bvt but rule is nil", + fields: fields{ + rule: nil, + systemSupported: pointer.Bool(true), + }, + args: args{ + request: &runtimeapi.PodSandboxHookRequest{ + Labels: map[string]string{ + ext.LabelPodQoS: string(ext.QoSBE), + }, + CgroupParent: "kubepods/besteffort/pod-besteffort-test-uid/", + }, + response: &runtimeapi.PodSandboxHookResponse{}, + }, + want: want{ + bvtValue: nil, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t 
*testing.T) { @@ -125,7 +148,7 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { initCPUBvt(tt.args.request.CgroupParent, 0, testHelper) b := &bvtPlugin{ - rule: defaultRule, + rule: tt.fields.rule, sysSupported: tt.fields.systemSupported, } ctx := &protocol.PodContext{} @@ -164,6 +187,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { }, } type fields struct { + rule *bvtRule sysSupported *bool } type args struct { @@ -181,6 +205,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { { name: "set guaranteed dir bvt", fields: fields{ + rule: defaultRule, sysSupported: pointer.BoolPtr(true), }, args: args{ @@ -193,6 +218,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { { name: "set burstable dir bvt", fields: fields{ + rule: defaultRule, sysSupported: pointer.BoolPtr(true), }, args: args{ @@ -205,6 +231,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { { name: "set be dir bvt", fields: fields{ + rule: defaultRule, sysSupported: pointer.BoolPtr(true), }, args: args{ @@ -217,6 +244,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { { name: "set be dir bvt but system not support", fields: fields{ + rule: defaultRule, sysSupported: pointer.BoolPtr(false), }, args: args{ @@ -226,6 +254,19 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { bvtValue: nil, }, }, + { + name: "set be dir bvt but rule is nil", + fields: fields{ + rule: nil, + sysSupported: pointer.BoolPtr(true), + }, + args: args{ + kubeQOS: corev1.PodQOSBestEffort, + }, + want: want{ + bvtValue: nil, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -234,7 +275,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { initCPUBvt(kubeQOSDir, 0, testHelper) b := &bvtPlugin{ - rule: defaultRule, + rule: tt.fields.rule, sysSupported: tt.fields.sysSupported, } ctx := &protocol.KubeQOSContext{} diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go index b5ce17390..2ba90562a 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go @@ -37,28 +37,30 @@ type bvtRule struct { kubeQOSPodParams map[corev1.PodQOSClass]int64 } -func (r *bvtRule) getPodBvtValue(podQoSClass ext.QoSClass, podKubeQoS corev1.PodQOSClass) int64 { +func (r *bvtRule) getPodBvtValue(podQoSClass ext.QoSClass, podKubeQOS corev1.PodQOSClass) int64 { if val, exist := r.podQOSParams[podQoSClass]; exist { return val } - if val, exist := r.kubeQOSPodParams[podKubeQoS]; exist { + if val, exist := r.kubeQOSPodParams[podKubeQOS]; exist { return val } - return *util.NoneCPUQoS().GroupIdentity + return *util.NoneCPUQOS().GroupIdentity } -func (r *bvtRule) getKubeQoSDirBvtValue(kubeQoS corev1.PodQOSClass) int64 { - if bvtValue, exist := r.kubeQOSDirParams[kubeQoS]; exist { +func (r *bvtRule) getKubeQOSDirBvtValue(kubeQOS corev1.PodQOSClass) int64 { + if bvtValue, exist := r.kubeQOSDirParams[kubeQOS]; exist { return bvtValue } - return *util.NoneCPUQoS().GroupIdentity + return *util.NoneCPUQOS().GroupIdentity } -func (b *bvtPlugin) parseRule(mergedNodeSLO *slov1alpha1.NodeSLOSpec) (bool, error) { +func (b *bvtPlugin) parseRule(mergedNodeSLOIf interface{}) (bool, error) { + mergedNodeSLO := mergedNodeSLOIf.(*slov1alpha1.NodeSLOSpec) + // setting pod rule by qos config - lsrValue := *mergedNodeSLO.ResourceQoSStrategy.LSR.CPUQoS.CPUQoS.GroupIdentity - lsValue := 
*mergedNodeSLO.ResourceQoSStrategy.LS.CPUQoS.GroupIdentity - beValue := *mergedNodeSLO.ResourceQoSStrategy.BE.CPUQoS.GroupIdentity + lsrValue := *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.CPUQOS.GroupIdentity + lsValue := *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.GroupIdentity + beValue := *mergedNodeSLO.ResourceQOSStrategy.BEClass.CPUQOS.GroupIdentity // setting besteffort according to BE besteffortDirVal := beValue @@ -69,12 +71,12 @@ func (b *bvtPlugin) parseRule(mergedNodeSLO *slov1alpha1.NodeSLOSpec) (bool, err burstablePodVal := lsValue // NOTICE guaranteed root dir must set as 0 until kernel supported - guaranteedDirVal := *util.NoneCPUQoS().GroupIdentity + guaranteedDirVal := *util.NoneCPUQOS().GroupIdentity // setting guaranteed pod enabled if LS or LSR enabled - guaranteedPodVal := *util.NoneCPUQoS().GroupIdentity - if *mergedNodeSLO.ResourceQoSStrategy.LSR.CPUQoS.Enable { + guaranteedPodVal := *util.NoneCPUQOS().GroupIdentity + if *mergedNodeSLO.ResourceQOSStrategy.LSRClass.CPUQOS.Enable { guaranteedPodVal = lsrValue - } else if *mergedNodeSLO.ResourceQoSStrategy.LS.CPUQoS.Enable { + } else if *mergedNodeSLO.ResourceQOSStrategy.LSClass.CPUQOS.Enable { guaranteedPodVal = lsValue } @@ -107,20 +109,24 @@ func (b *bvtPlugin) ruleUpdateCb(pods []*statesinformer.PodMeta) error { return nil } r := b.getRule() - for _, kubeQoS := range []corev1.PodQOSClass{ + if r == nil { + klog.V(5).Infof("hook plugin rule is nil, nothing to do for plugin %v", name) + return nil + } + for _, kubeQOS := range []corev1.PodQOSClass{ corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { - bvtValue := r.getKubeQoSDirBvtValue(kubeQoS) - kubeQoSCgroupPath := util.GetKubeQosRelativePath(kubeQoS) - if err := sysutil.CgroupFileWrite(kubeQoSCgroupPath, sysutil.CPUBVTWarpNs, strconv.FormatInt(bvtValue, 10)); err != nil { - klog.Infof("update kube qos %v cpu bvt failed, dir %v, error %v", kubeQoS, kubeQoSCgroupPath, err) + bvtValue := r.getKubeQOSDirBvtValue(kubeQOS) + kubeQOSCgroupPath := util.GetKubeQosRelativePath(kubeQOS) + if err := sysutil.CgroupFileWrite(kubeQOSCgroupPath, sysutil.CPUBVTWarpNs, strconv.FormatInt(bvtValue, 10)); err != nil { + klog.Infof("update kube qos %v cpu bvt failed, dir %v, error %v", kubeQOS, kubeQOSCgroupPath, err) } else { - audit.V(2).Group(string(kubeQoS)).Reason(name).Message("set bvt to %v", bvtValue) + audit.V(2).Group(string(kubeQOS)).Reason(name).Message("set bvt to %v", bvtValue) } } for _, podMeta := range pods { - podQoS := ext.GetPodQoSClass(podMeta.Pod) - podKubeQoS := podMeta.Pod.Status.QOSClass - podBvt := r.getPodBvtValue(podQoS, podKubeQoS) + podQOS := ext.GetPodQoSClass(podMeta.Pod) + podKubeQOS := podMeta.Pod.Status.QOSClass + podBvt := r.getPodBvtValue(podQOS, podKubeQOS) podCgroupPath := util.GetPodCgroupDirWithKube(podMeta.CgroupDir) if err := sysutil.CgroupFileWrite(podCgroupPath, sysutil.CPUBVTWarpNs, strconv.FormatInt(podBvt, 10)); err != nil { klog.Infof("update pod %s cpu bvt failed, dir %v, error %v", @@ -135,6 +141,9 @@ func (b *bvtPlugin) ruleUpdateCb(pods []*statesinformer.PodMeta) error { func (b *bvtPlugin) getRule() *bvtRule { b.ruleRWMutex.RLock() defer b.ruleRWMutex.RUnlock() + if b.rule == nil { + return nil + } rule := *b.rule return &rule } diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go index 2a082274c..6dbfd3923 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go +++ 
b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go @@ -111,27 +111,27 @@ func Test_bvtPlugin_parseRule(t *testing.T) { name: "parse normal rules", args: args{ mergedNodeSLO: &slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(2), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(2), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(-1), }, }, @@ -163,27 +163,27 @@ func Test_bvtPlugin_parseRule(t *testing.T) { name: "parse rules with lsr disabled", args: args{ mergedNodeSLO: &slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(false), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(2), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(-1), }, }, @@ -215,27 +215,27 @@ func Test_bvtPlugin_parseRule(t *testing.T) { name: "parse rules with lsr and ls disabled", args: args{ mergedNodeSLO: &slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(false), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(false), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(-1), }, }, @@ -267,27 +267,27 @@ func Test_bvtPlugin_parseRule(t *testing.T) { name: "parse rules with all disabled", args: args{ mergedNodeSLO: &slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - CPUQoS: 
&slov1alpha1.CPUQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(false), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(false), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(false), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), }, }, @@ -336,27 +336,27 @@ func Test_bvtPlugin_parseRule(t *testing.T) { }, }, mergedNodeSLO: &slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(2), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(2), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.Bool(true), - CPUQoS: slov1alpha1.CPUQoS{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(-1), }, }, diff --git a/pkg/koordlet/runtimehooks/hooks/hooks.go b/pkg/koordlet/runtimehooks/hooks/hooks.go index 432eb8642..891fb5035 100644 --- a/pkg/koordlet/runtimehooks/hooks/hooks.go +++ b/pkg/koordlet/runtimehooks/hooks/hooks.go @@ -39,9 +39,10 @@ var globalStageHooks map[rmconfig.RuntimeHookType][]*Hook func Register(stage rmconfig.RuntimeHookType, name, description string, hookFn HookFn) *Hook { h, error := generateNewHook(stage, name) if error != nil { - klog.Fatal("hook %s is conflict since name is already registered") + klog.Fatalf("hook %s is conflict since name is already registered", name) return h } + klog.V(1).Infof("hook %s is registered", name) h.description = description h.fn = hookFn return h diff --git a/pkg/koordlet/runtimehooks/protocol/container_context.go b/pkg/koordlet/runtimehooks/protocol/container_context.go index 949f0b9a7..7563a1bcb 100644 --- a/pkg/koordlet/runtimehooks/protocol/container_context.go +++ b/pkg/koordlet/runtimehooks/protocol/container_context.go @@ -17,6 +17,8 @@ limitations under the License. 
package protocol import ( + "fmt" + "k8s.io/klog/v2" runtimeapi "github.com/koordinator-sh/koordinator/apis/runtime/v1alpha1" @@ -27,12 +29,13 @@ import ( type ContainerMeta struct { Name string - UID string + ID string // docker://xxx; containerd:// } -func (c *ContainerMeta) FromProxy(meta *runtimeapi.ContainerMetadata) { - c.Name = meta.GetName() - c.UID = meta.GetId() +func (c *ContainerMeta) FromProxy(containerMeta *runtimeapi.ContainerMetadata, podAnnotations map[string]string) { + c.Name = containerMeta.GetName() + uid := containerMeta.GetId() + c.ID = getContainerID(podAnnotations, uid) } type ContainerRequest struct { @@ -45,23 +48,24 @@ type ContainerRequest struct { func (c *ContainerRequest) FromProxy(req *runtimeapi.ContainerResourceHookRequest) { c.PodMeta.FromProxy(req.PodMeta) - c.ContainerMeta.FromProxy(req.ContainerMata) + c.ContainerMeta.FromProxy(req.ContainerMata, req.PodAnnotations) c.PodLabels = req.GetPodLabels() c.PodAnnotations = req.GetPodAnnotations() - c.CgroupParent, _ = util.GetContainerCgroupPathWithKubeByID(req.GetPodCgroupParent(), req.ContainerMata.Id) + c.CgroupParent, _ = util.GetContainerCgroupPathWithKubeByID(req.GetPodCgroupParent(), c.ContainerMeta.ID) } -func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, containerID string) { +func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, containerName string) { c.PodMeta.FromReconciler(podMeta.Pod.ObjectMeta) - c.ContainerMeta.UID = containerID + c.ContainerMeta.Name = containerName for _, containerStat := range podMeta.Pod.Status.ContainerStatuses { - if containerStat.ContainerID == containerID { - c.ContainerMeta.Name = containerStat.Name + if containerStat.Name == containerName { + c.ContainerMeta.ID = containerStat.ContainerID + break } } c.PodLabels = podMeta.Pod.Labels c.PodAnnotations = podMeta.Pod.Annotations - c.CgroupParent, _ = util.GetContainerCgroupPathWithKubeByID(podMeta.CgroupDir, containerID) + c.CgroupParent, _ = util.GetContainerCgroupPathWithKubeByID(podMeta.CgroupDir, c.ContainerMeta.ID) } type ContainerResponse struct { @@ -94,8 +98,8 @@ func (c *ContainerContext) ProxyDone(resp *runtimeapi.ContainerResourceHookRespo c.Response.ProxyDone(resp) } -func (c *ContainerContext) FromReconciler(podMeta *statesinformer.PodMeta, containerUID string) { - c.Request.FromReconciler(podMeta, containerUID) +func (c *ContainerContext) FromReconciler(podMeta *statesinformer.PodMeta, containerName string) { + c.Request.FromReconciler(podMeta, containerName) } func (c *ContainerContext) ReconcilerDone() { @@ -112,7 +116,7 @@ func (c *ContainerContext) injectForOrigin() { klog.V(5).Infof("set container %v/%v/%v cpuset %v on cgroup parent %v", c.Request.PodMeta.Namespace, c.Request.PodMeta.Name, c.Request.ContainerMeta.Name, *c.Response.Resources.CPUSet, c.Request.CgroupParent) - audit.V(2).Container(c.Request.ContainerMeta.UID).Reason("runtime-hooks").Message( + audit.V(2).Container(c.Request.ContainerMeta.ID).Reason("runtime-hooks").Message( "set container cpuset to %v", *c.Response.Resources.CPUSet).Do() } } @@ -122,3 +126,12 @@ func (c *ContainerContext) injectForOrigin() { func (c *ContainerContext) injectForExt() { // TODO } + +func getContainerID(podAnnotations map[string]string, containerUID string) string { + // TODO parse from runtime hook request directly + runtimeType := "containerd" + if _, exist := podAnnotations["io.kubernetes.docker.type"]; exist { + runtimeType = "docker" + } + return fmt.Sprintf("%s://%s", runtimeType, containerUID) +} diff 
--git a/pkg/koordlet/runtimehooks/protocol/pod_context.go b/pkg/koordlet/runtimehooks/protocol/pod_context.go index ca4f1854c..d95511279 100644 --- a/pkg/koordlet/runtimehooks/protocol/pod_context.go +++ b/pkg/koordlet/runtimehooks/protocol/pod_context.go @@ -17,6 +17,7 @@ limitations under the License. package protocol import ( + "github.com/koordinator-sh/koordinator/pkg/util" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" @@ -61,7 +62,7 @@ func (p *PodRequest) FromReconciler(podMeta *statesinformer.PodMeta) { p.PodMeta.FromReconciler(podMeta.Pod.ObjectMeta) p.Labels = podMeta.Pod.Labels p.Annotations = podMeta.Pod.Annotations - p.CgroupParent = podMeta.CgroupDir + p.CgroupParent = util.GetPodCgroupDirWithKube(podMeta.CgroupDir) } type PodResponse struct { diff --git a/pkg/koordlet/runtimehooks/protocol/protocol.go b/pkg/koordlet/runtimehooks/protocol/protocol.go index 43bf0beb2..f5daa96f0 100644 --- a/pkg/koordlet/runtimehooks/protocol/protocol.go +++ b/pkg/koordlet/runtimehooks/protocol/protocol.go @@ -31,7 +31,7 @@ type HooksProtocol interface { type hooksProtocolBuilder struct { KubeQOS func(kubeQOS corev1.PodQOSClass) HooksProtocol Pod func(podMeta *statesinformer.PodMeta) HooksProtocol - Container func(podMeta *statesinformer.PodMeta, containerUID string) HooksProtocol + Container func(podMeta *statesinformer.PodMeta, containerName string) HooksProtocol } var HooksProtocolBuilder = hooksProtocolBuilder{ @@ -45,9 +45,9 @@ var HooksProtocolBuilder = hooksProtocolBuilder{ p.FromReconciler(podMeta) return p }, - Container: func(podMeta *statesinformer.PodMeta, containerUID string) HooksProtocol { + Container: func(podMeta *statesinformer.PodMeta, containerName string) HooksProtocol { c := &ContainerContext{} - c.FromReconciler(podMeta, containerUID) + c.FromReconciler(podMeta, containerName) return c }, } diff --git a/pkg/koordlet/runtimehooks/proxyserver/service.go b/pkg/koordlet/runtimehooks/proxyserver/service.go index 3f843241f..2650eb8aa 100644 --- a/pkg/koordlet/runtimehooks/proxyserver/service.go +++ b/pkg/koordlet/runtimehooks/proxyserver/service.go @@ -40,6 +40,7 @@ func (s *server) PreRunPodSandboxHook(ctx context.Context, podCtx.FromProxy(req) hooks.RunHooks(rmconfig.PreRunPodSandbox, podCtx) podCtx.ProxyDone(resp) + klog.V(5).Infof("send PreRunPodSandboxHook response %v", resp.String()) return resp, nil } @@ -56,6 +57,7 @@ func (s *server) PostStopPodSandboxHook(ctx context.Context, podCtx.FromProxy(req) hooks.RunHooks(rmconfig.PostStopPodSandbox, podCtx) podCtx.ProxyDone(resp) + klog.V(5).Infof("send PostStopPodSandboxHook response %v", resp.String()) return resp, nil } @@ -71,6 +73,7 @@ func (s *server) PreCreateContainerHook(ctx context.Context, containerCtx.FromProxy(req) hooks.RunHooks(rmconfig.PreCreateContainer, containerCtx) containerCtx.ProxyDone(resp) + klog.V(5).Infof("send PreCreateContainerHook response %v", resp.String()) return resp, nil } @@ -86,6 +89,7 @@ func (s *server) PreStartContainerHook(ctx context.Context, containerCtx.FromProxy(req) hooks.RunHooks(rmconfig.PreStartContainer, containerCtx) containerCtx.ProxyDone(resp) + klog.V(5).Infof("send PreStartContainerHook response %v", resp.String()) return resp, nil } @@ -101,6 +105,7 @@ func (s *server) PostStartContainerHook(ctx context.Context, containerCtx.FromProxy(req) hooks.RunHooks(rmconfig.PostStartContainer, containerCtx) containerCtx.ProxyDone(resp) + klog.V(5).Infof("send PostStartContainerHook response %v", resp.String()) return resp, nil } @@ -116,6 +121,7 @@ func (s *server) 
PostStopContainerHook(ctx context.Context, containerCtx.FromProxy(req) hooks.RunHooks(rmconfig.PostStopContainer, containerCtx) containerCtx.ProxyDone(resp) + klog.V(5).Infof("send PostStopContainerHook response %v", resp.String()) return resp, nil } @@ -131,5 +137,6 @@ func (s *server) PreUpdateContainerResourcesHook(ctx context.Context, containerCtx.FromProxy(req) hooks.RunHooks(rmconfig.PreUpdateContainerResources, containerCtx) containerCtx.ProxyDone(resp) + klog.V(5).Infof("send PreUpdateContainerResourcesHook response %v", resp.String()) return resp, nil } diff --git a/pkg/koordlet/runtimehooks/reconciler/reconciler.go b/pkg/koordlet/runtimehooks/reconciler/reconciler.go new file mode 100644 index 000000000..85caee3b6 --- /dev/null +++ b/pkg/koordlet/runtimehooks/reconciler/reconciler.go @@ -0,0 +1,190 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package reconciler + +import ( + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" + "github.com/koordinator-sh/koordinator/pkg/util" + "github.com/koordinator-sh/koordinator/pkg/util/system" +) + +const ( + kubeQOSReconcileSeconds = 10 +) + +type ReconcilerLevel string + +const ( + KubeQOSLevel ReconcilerLevel = "kubeqos" + PodLevel ReconcilerLevel = "pod" + ContainerLevel ReconcilerLevel = "container" +) + +// map[string]*cgroupReconciler: key is cgroup filename, +var globalCgroupReconciler = map[ReconcilerLevel]map[string]*cgroupReconciler{ + KubeQOSLevel: {}, + PodLevel: {}, + ContainerLevel: {}, +} + +type cgroupReconciler struct { + cgroupFile system.CgroupFile + fn reconcileFunc + description string +} + +type reconcileFunc func(protocol.HooksProtocol) error + +func RegisterCgroupReconciler(level ReconcilerLevel, cgroupFile system.CgroupFile, + fn reconcileFunc, description string) { + if _, ok := globalCgroupReconciler[level]; !ok { + klog.Fatalf("resource level %v has not init", level) + } + if c, exist := globalCgroupReconciler[level][cgroupFile.ResourceFileName]; exist { + klog.Fatalf("%v already registered by %v", cgroupFile.ResourceFileName, c.description) + } + globalCgroupReconciler[level][cgroupFile.ResourceFileName] = &cgroupReconciler{ + cgroupFile: cgroupFile, + fn: fn, + description: description, + } + klog.V(1).Infof("register reconcile function %v finished, detailed info: level=%v, filename=%v", + description, level, cgroupFile.ResourceFileName) +} + +type Reconciler interface { + Run(stopCh <-chan struct{}) error +} + +func NewReconciler(s statesinformer.StatesInformer) Reconciler { + r := &reconciler{ + podUpdated: make(chan struct{}, 1), + } + // TODO register individual pod event + s.RegisterCallbacks(statesinformer.RegisterTypeAllPods, "runtime-hooks-reconciler", + "Reconcile cgroup files if pod updated", r.podRefreshCallback) + return r +} + +type reconciler struct { + podsMutex sync.RWMutex + podsMeta 
[]*statesinformer.PodMeta
+	podUpdated chan struct{}
+}
+
+func (c *reconciler) Run(stopCh <-chan struct{}) error {
+	go c.reconcilePodCgroup(stopCh)
+	go c.reconcileKubeQOSCgroup(stopCh)
+	klog.V(1).Infof("start runtime hook reconciler successfully")
+	return nil
+}
+
+func (c *reconciler) podRefreshCallback(t statesinformer.RegisterType, o interface{},
+	podsMeta []*statesinformer.PodMeta) {
+	c.podsMutex.Lock()
+	defer c.podsMutex.Unlock()
+	c.podsMeta = podsMeta
+	if len(c.podUpdated) == 0 {
+		c.podUpdated <- struct{}{}
+	}
+}
+
+func (c *reconciler) getPodsMeta() []*statesinformer.PodMeta {
+	c.podsMutex.RLock()
+	defer c.podsMutex.RUnlock()
+	result := make([]*statesinformer.PodMeta, len(c.podsMeta))
+	copy(result, c.podsMeta)
+	return result
+}
+
+func (c *reconciler) reconcileKubeQOSCgroup(stopCh <-chan struct{}) {
+	// TODO refactor kubeqos reconciler, inotify watch corresponding cgroup file and update only when receive modified event
+	duration := time.Duration(kubeQOSReconcileSeconds) * time.Second
+	timer := time.NewTimer(duration)
+	defer timer.Stop()
+	for {
+		select {
+		case <-timer.C:
+			doKubeQOSCgroup()
+			timer.Reset(duration)
+		case <-stopCh:
+			klog.V(1).Infof("stop reconcile kube qos cgroup")
+			return
+		}
+	}
+}
+
+func doKubeQOSCgroup() {
+	for _, kubeQOS := range []corev1.PodQOSClass{
+		corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} {
+		for _, r := range globalCgroupReconciler[KubeQOSLevel] {
+			kubeQOSCtx := protocol.HooksProtocolBuilder.KubeQOS(kubeQOS)
+			if err := r.fn(kubeQOSCtx); err != nil {
+				klog.Warningf("calling reconcile function %v failed, error %v", r.description, err)
+			} else {
+				kubeQOSCtx.ReconcilerDone()
+				klog.V(5).Infof("calling reconcile function %v for kube qos %v finished",
+					r.description, kubeQOS)
+			}
+		}
+	}
+}
+
+func (c *reconciler) reconcilePodCgroup(stopCh <-chan struct{}) {
+	// TODO refactor pod reconciler, inotify watch corresponding cgroup file and update only when receive modified event
+	// new watcher will be added with new pod created, and deleted with pod destroyed
+	for {
+		select {
+		case <-c.podUpdated:
+			podsMeta := c.getPodsMeta()
+			for _, podMeta := range podsMeta {
+				for _, r := range globalCgroupReconciler[PodLevel] {
+					podCtx := protocol.HooksProtocolBuilder.Pod(podMeta)
+					if err := r.fn(podCtx); err != nil {
+						klog.Warningf("calling reconcile function %v failed, error %v", r.description, err)
+					} else {
+						podCtx.ReconcilerDone()
+						klog.V(5).Infof("calling reconcile function %v for pod %v finished",
+							r.description, util.GetPodKey(podMeta.Pod))
+					}
+				}
+				for _, containerStat := range podMeta.Pod.Status.ContainerStatuses {
+					for _, r := range globalCgroupReconciler[ContainerLevel] {
+						containerCtx := protocol.HooksProtocolBuilder.Container(
+							podMeta, containerStat.Name)
+						if err := r.fn(containerCtx); err != nil {
+							klog.Warningf("calling reconcile function %v failed, error %v", r.description, err)
+						} else {
+							containerCtx.ReconcilerDone()
+							klog.V(5).Infof("calling reconcile function %v for container %v/%v finished",
+								r.description, util.GetPodKey(podMeta.Pod), containerStat.Name)
+						}
+					}
+				}
+			}
+		case <-stopCh:
+			klog.V(1).Infof("stop reconcile pod cgroup")
+			return
+		}
+	}
+}
diff --git a/pkg/koordlet/runtimehooks/reconciler/reconciler_test.go b/pkg/koordlet/runtimehooks/reconciler/reconciler_test.go
new file mode 100644
index 000000000..a56164d79
--- /dev/null
+++ b/pkg/koordlet/runtimehooks/reconciler/reconciler_test.go
@@ -0,0 +1,236 @@
+/*
+Copyright 2022 The Koordinator Authors.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package reconciler + +import ( + "strings" + "testing" + + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" + mock_statesinformer "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/mockstatesinformer" + "github.com/koordinator-sh/koordinator/pkg/util/system" +) + +func Test_doKubeQOSCgroup(t *testing.T) { + type args struct { + file system.CgroupFile + targetOutput map[corev1.PodQOSClass]string + } + type wants struct { + kubeQOSVal map[corev1.PodQOSClass]string + } + type gots struct { + kubeQOSVal map[corev1.PodQOSClass]string + } + tests := []struct { + name string + args args + gots gots + wants wants + }{ + { + name: "exec kube qos level function", + args: args{ + file: system.CPUBVTWarpNs, + targetOutput: map[corev1.PodQOSClass]string{ + corev1.PodQOSGuaranteed: "test-guaranteed", + corev1.PodQOSBurstable: "test-burstable", + corev1.PodQOSBestEffort: "test-besteffort", + }, + }, + gots: gots{ + kubeQOSVal: map[corev1.PodQOSClass]string{}, + }, + wants: wants{ + kubeQOSVal: map[corev1.PodQOSClass]string{ + corev1.PodQOSGuaranteed: "test-guaranteed", + corev1.PodQOSBurstable: "test-burstable", + corev1.PodQOSBestEffort: "test-besteffort", + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + reconcilerFn := func(proto protocol.HooksProtocol) error { + kubeQOSCtx := proto.(*protocol.KubeQOSContext) + kubeQOS := kubeQOSCtx.Request.KubeQOSClass + tt.gots.kubeQOSVal[kubeQOS] = tt.args.targetOutput[kubeQOS] + return nil + } + RegisterCgroupReconciler(KubeQOSLevel, tt.args.file, reconcilerFn, tt.name) + doKubeQOSCgroup() + assert.Equal(t, tt.wants.kubeQOSVal, tt.gots.kubeQOSVal, "kube qos map value should be equal") + }) + } +} + +func Test_reconciler_reconcilePodCgroup(t *testing.T) { + stopCh := make(chan struct{}, 1) + tryStopFn := func() { + select { + case stopCh <- struct{}{}: + default: + } + } + + genPodKey := func(ns, name string) string { + return strings.Join([]string{ns, name}, "/") + } + genContainerKey := func(ns, podName, containerName string) string { + return strings.Join([]string{ns, podName, containerName}, "/") + } + podLevelOutput := map[string]string{} + containerLevelOutput := map[string]string{} + + podReconcilerFn := func(proto protocol.HooksProtocol) error { + podCtx := proto.(*protocol.PodContext) + podKey := genPodKey(podCtx.Request.PodMeta.Namespace, podCtx.Request.PodMeta.Name) + podLevelOutput[podKey] = podCtx.Request.PodMeta.UID + tryStopFn() + return nil + } + containerReconcilerFn := func(proto protocol.HooksProtocol) error { + containerCtx := proto.(*protocol.ContainerContext) + containerKey := genContainerKey(containerCtx.Request.PodMeta.Namespace, containerCtx.Request.PodMeta.Name, + containerCtx.Request.ContainerMeta.Name) + 
containerLevelOutput[containerKey] = containerCtx.Request.ContainerMeta.ID + tryStopFn() + return nil + } + RegisterCgroupReconciler(PodLevel, system.CPUBVTWarpNs, podReconcilerFn, "get pod uid") + RegisterCgroupReconciler(ContainerLevel, system.CPUBVTWarpNs, containerReconcilerFn, "get container uid") + + type fields struct { + podsMeta *statesinformer.PodMeta + } + type wants struct { + wantPods map[string]string + wantContainers map[string]string + } + + test := struct { + name string + fields fields + wants wants + }{ + + name: "reconcile pod cgroup to get uid", + fields: fields{ + podsMeta: &statesinformer.PodMeta{ + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-pod-name", + UID: "test-pod-uid", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container-name", + }, + }, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container-name", + ContainerID: "test-container-id", + }, + }, + }, + }, + }, + }, + wants: wants{ + wantPods: map[string]string{ + genPodKey("test-ns", "test-pod-name"): "test-pod-uid", + }, + wantContainers: map[string]string{ + genContainerKey("test-ns", "test-pod-name", "test-container-name"): "test-container-id", + }, + }, + } + t.Run(test.name, func(t *testing.T) { + c := &reconciler{ + podsMeta: []*statesinformer.PodMeta{test.fields.podsMeta}, + podUpdated: make(chan struct{}, 1), + } + c.podUpdated <- struct{}{} + c.reconcilePodCgroup(stopCh) + assert.Equal(t, test.wants.wantPods, podLevelOutput, "pod reconciler should be equal") + assert.Equal(t, test.wants.wantContainers, containerLevelOutput, "container reconciler should be equal") + }) +} + +func Test_reconciler_podRefreshCallback(t *testing.T) { + type args struct { + podsMeta []*statesinformer.PodMeta + } + tests := []struct { + name string + args args + }{ + { + name: "callback refresh pod meta", + args: args{ + podsMeta: []*statesinformer.PodMeta{ + { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-name", + }, + }, + CgroupDir: "test-dir", + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := &reconciler{ + podUpdated: make(chan struct{}, 1), + } + c.podRefreshCallback(statesinformer.RegisterTypeAllPods, nil, tt.args.podsMeta) + assert.Equal(t, c.podsMeta, tt.args.podsMeta, "callback update pod meta") + }) + } +} + +func TestNewReconciler(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + si := mock_statesinformer.NewMockStatesInformer(ctrl) + si.EXPECT().RegisterCallbacks(statesinformer.RegisterTypeAllPods, gomock.Any(), gomock.Any(), gomock.Any()) + r := NewReconciler(si) + nr := r.(*reconciler) + stopCh := make(chan struct{}, 1) + stopCh <- struct{}{} + nr.reconcilePodCgroup(stopCh) + stopCh <- struct{}{} + err := r.Run(stopCh) + assert.NoError(t, err, "run reconciler without error") + +} diff --git a/pkg/koordlet/runtimehooks/rule/inject.go b/pkg/koordlet/runtimehooks/rule/inject.go index d900d1589..92993e73e 100644 --- a/pkg/koordlet/runtimehooks/rule/inject.go +++ b/pkg/koordlet/runtimehooks/rule/inject.go @@ -18,6 +18,8 @@ package rule import ( "fmt" + + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" ) type InjectOption interface { @@ -38,11 +40,12 @@ func NewFuncInject(f func(interface{}) error) *funcInject { } } -func WithParseFunc(parseFunc ParseRuleFn) InjectOption { +func WithParseFunc(t statesinformer.RegisterType, parseFunc ParseRuleFn) 
InjectOption { return NewFuncInject(func(o interface{}) error { switch o := o.(type) { case *Rule: - o.parseFunc = parseFunc + o.parseRuleType = t + o.parseRuleFn = parseFunc default: return fmt.Errorf("WithSystemSupported is invalid for type %T", o) } diff --git a/pkg/koordlet/runtimehooks/rule/rule.go b/pkg/koordlet/runtimehooks/rule/rule.go index 9e57612d3..c3805d1e3 100644 --- a/pkg/koordlet/runtimehooks/rule/rule.go +++ b/pkg/koordlet/runtimehooks/rule/rule.go @@ -23,7 +23,6 @@ import ( "k8s.io/klog/v2" - slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" "github.com/koordinator-sh/koordinator/pkg/util" ) @@ -31,12 +30,13 @@ import ( type Rule struct { name string description string - parseFunc ParseRuleFn + parseRuleType statesinformer.RegisterType + parseRuleFn ParseRuleFn callbacks []UpdateCbFn systemSupported bool } -type ParseRuleFn func(nodeSLO *slov1alpha1.NodeSLOSpec) (bool, error) +type ParseRuleFn func(interface{}) (bool, error) type UpdateCbFn func(pods []*statesinformer.PodMeta) error type SysSupportFn func() bool @@ -53,6 +53,7 @@ func Register(name, description string, injectOpts ...InjectOption) *Rule { for _, opt := range injectOpts { opt.Apply(r) } + klog.V(5).Infof("new rule %v has registered", name) return r } @@ -71,28 +72,33 @@ func find(name string) (*Rule, bool) { if r, exist := globalHookRules[name]; exist { return r, true } - newRule := &Rule{name: name} + newRule := &Rule{name: name, systemSupported: true} globalHookRules[name] = newRule return newRule, false } -func UpdateRules(s statesinformer.StatesInformer) { - nodeSLO := s.GetNodeSLO() - klog.Infof("applying %v rules with new NodeSLO %v", len(globalHookRules), util.DumpJSON(nodeSLO)) +func UpdateRules(ruleType statesinformer.RegisterType, ruleObj interface{}, podsMeta []*statesinformer.PodMeta) { + klog.V(3).Infof("applying %v rules with new %v, detail: %v", + len(globalHookRules), ruleType.String(), util.DumpJSON(ruleObj)) for _, r := range globalHookRules { + if ruleType != r.parseRuleType { + continue + } if !r.systemSupported { - klog.Infof("system unsupported for rule %s, do nothing during UpdateRules", r.name) + klog.V(4).Infof("system unsupported for rule %s, do nothing during UpdateRules", r.name) return } - updated, err := r.parseFunc(&nodeSLO.Spec) + if r.parseRuleFn == nil { + continue + } + updated, err := r.parseRuleFn(ruleObj) if err != nil { klog.Warningf("parse rule %s from nodeSLO failed, error: %v", r.name, err) continue } if updated { - pods := s.GetAllPods() - klog.Infof("rule %s is updated, run update callback for all %v pods", r.name, len(pods)) - r.runUpdateCallbacks(pods) + klog.V(3).Infof("rule %s is updated, run update callback for all %v pods", r.name, len(podsMeta)) + r.runUpdateCallbacks(podsMeta) } } } diff --git a/pkg/koordlet/runtimehooks/runtimehooks.go b/pkg/koordlet/runtimehooks/runtimehooks.go index a043499eb..e5f91920c 100644 --- a/pkg/koordlet/runtimehooks/runtimehooks.go +++ b/pkg/koordlet/runtimehooks/runtimehooks.go @@ -20,14 +20,13 @@ import ( "k8s.io/klog/v2" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/proxyserver" - //"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/reconciler" + "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/reconciler" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/rule" "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" ) type HookPlugin interface { Register() - 
SystemSupported() bool
 }
 
 type RuntimeHook interface {
@@ -37,7 +36,7 @@ type RuntimeHook interface {
 type runtimeHook struct {
 	statesInformer statesinformer.StatesInformer
 	server         proxyserver.Server
-	//reconciler     reconciler.Reconciler
+	reconciler     reconciler.Reconciler
 }
 
 func (r *runtimeHook) Run(stopCh <-chan struct{}) error {
@@ -45,9 +44,9 @@ func (r *runtimeHook) Run(stopCh <-chan struct{}) error {
 	if err := r.server.Start(); err != nil {
 		return err
 	}
-	//if err := r.reconciler.Run(stopCh); err != nil {
-	//	return err
-	//}
+	if err := r.reconciler.Run(stopCh); err != nil {
+		return err
+	}
 	klog.V(5).Infof("runtime hook server has started")
 	<-stopCh
 	klog.Infof("runtime hook is stopped")
@@ -62,12 +61,15 @@ func NewRuntimeHook(si statesinformer.StatesInformer, cfg *Config) (RuntimeHook,
 	r := &runtimeHook{
 		statesInformer: si,
 		server:         s,
-		//reconciler:     reconciler.NewReconciler(si),
+		reconciler:     reconciler.NewReconciler(si),
 	}
 
 	registerPlugins()
-	si.RegisterCallbacks(statesinformer.RegisterTypeNodeSLO, "runtime-hooks-rule",
+	si.RegisterCallbacks(statesinformer.RegisterTypeNodeSLOSpec, "runtime-hooks-rule-node-slo",
 		"Update hooks rule can run callbacks if NodeSLO spec update", rule.UpdateRules)
+	si.RegisterCallbacks(statesinformer.RegisterTypeNodeTopology, "runtime-hooks-rule-node-topo",
+		"Update hooks rule if NodeTopology info update",
+		rule.UpdateRules)
 	if err := s.Setup(); err != nil {
 		klog.Fatal("failed to setup runtime hook server, error %v", err)
 		return nil, err
diff --git a/pkg/koordlet/runtimehooks/runtimehooks_test.go b/pkg/koordlet/runtimehooks/runtimehooks_test.go
index 0ff053e28..0e59b6c2c 100644
--- a/pkg/koordlet/runtimehooks/runtimehooks_test.go
+++ b/pkg/koordlet/runtimehooks/runtimehooks_test.go
@@ -18,13 +18,11 @@ package runtimehooks
 
 import (
 	"path"
-	"reflect"
 	"testing"
 
 	"github.com/golang/mock/gomock"
 	"github.com/stretchr/testify/assert"
 
-	slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1"
 	mockstatesinformer "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/mockstatesinformer"
 )
 
@@ -66,11 +64,12 @@ func Test_runtimeHook_Run(t *testing.T) {
 			ctrl := gomock.NewController(t)
 			defer ctrl.Finish()
 			si := mockstatesinformer.NewMockStatesInformer(ctrl)
-			si.EXPECT().RegisterCallbacks(reflect.TypeOf(&slov1alpha1.NodeSLO{}), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
+			si.EXPECT().RegisterCallbacks(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
 			r, err := NewRuntimeHook(si, tt.fields.config)
 			assert.NoError(t, err)
 			stop := make(chan struct{})
-			go func() { stop <- struct{}{} }()
+
+			go func() { close(stop) }()
 			err = r.Run(stop)
 			assert.NoError(t, err)
 		})
diff --git a/pkg/koordlet/statesinformer/kubelet_stub.go b/pkg/koordlet/statesinformer/kubelet_stub.go
index f5c2dde5e..7151e7d02 100644
--- a/pkg/koordlet/statesinformer/kubelet_stub.go
+++ b/pkg/koordlet/statesinformer/kubelet_stub.go
@@ -20,12 +20,14 @@ import (
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"net"
 	"net/http"
+	"net/url"
+	"strconv"
 	"time"
 
 	corev1 "k8s.io/api/core/v1"
-	utilnet "k8s.io/apimachinery/pkg/util/net"
-	"k8s.io/client-go/transport"
+	"k8s.io/client-go/rest"
 )
 
 type KubeletStub interface {
@@ -35,57 +37,44 @@
 type kubeletStub struct {
 	addr       string
 	port       int
+	scheme     string
 	httpClient *http.Client
 }
 
-func NewKubeletStub(addr string, port int, timeout time.Duration, token string) (KubeletStub, error) {
-	preTlsConfig := makeTransportConfig(token, true)
-	tlsConfig, err := transport.TLSConfigFor(preTlsConfig)
-	if err !=
nil { - return nil, err - } - rt := http.DefaultTransport - if tlsConfig != nil { - // If SSH Tunnel is turned on - rt = utilnet.SetOldTransportDefaults(&http.Transport{ - TLSClientConfig: tlsConfig, - }) - } - roundTripper, err := transport.HTTPWrappersForConfig(makeTransportConfig(token, true), rt) - if err != nil { - return nil, err - } +func NewKubeletStub(addr string, port int, scheme string, timeout time.Duration, cfg *rest.Config) (KubeletStub, error) { client := &http.Client{ - Timeout: timeout, - Transport: roundTripper, + Timeout: timeout, + } + if cfg != nil && rest.IsConfigTransportTLS(*cfg) { + transport, err := rest.TransportFor(cfg) + if err != nil { + return nil, err + } + client.Transport = transport } + return &kubeletStub{ httpClient: client, addr: addr, port: port, + scheme: scheme, }, nil } -func makeTransportConfig(token string, insecure bool) *transport.Config { - tlsConfig := &transport.Config{ - BearerToken: token, - TLS: transport.TLSConfig{ - Insecure: true, - }, - } - return tlsConfig -} - func (k *kubeletStub) GetAllPods() (corev1.PodList, error) { + url := url.URL{ + Scheme: k.scheme, + Host: net.JoinHostPort(k.addr, strconv.Itoa(k.port)), + Path: "/pods/", + } podList := corev1.PodList{} - url := fmt.Sprintf("https://%v:%d/pods/", k.addr, k.port) - rsp, err := k.httpClient.Get(url) + rsp, err := k.httpClient.Get(url.String()) if err != nil { return podList, err } defer rsp.Body.Close() if rsp.StatusCode != http.StatusOK { - return podList, fmt.Errorf("request %s failed, code %d", url, rsp.StatusCode) + return podList, fmt.Errorf("request %s failed, code %d", url.String(), rsp.StatusCode) } body, err := ioutil.ReadAll(rsp.Body) diff --git a/pkg/koordlet/statesinformer/kubelet_stub_test.go b/pkg/koordlet/statesinformer/kubelet_stub_test.go index 834a711d4..bf9369dee 100644 --- a/pkg/koordlet/statesinformer/kubelet_stub_test.go +++ b/pkg/koordlet/statesinformer/kubelet_stub_test.go @@ -24,7 +24,6 @@ import ( "net/http" "net/http/httptest" "net/url" - "reflect" "strconv" "strings" "testing" @@ -32,7 +31,7 @@ import ( "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" - "k8s.io/client-go/transport" + "k8s.io/client-go/rest" ) var ( @@ -91,9 +90,17 @@ func Test_kubeletStub_GetAllPods(t *testing.T) { if err != nil { t.Fatal(err) } + port, _ := strconv.Atoi(portStr) + cfg := &rest.Config{ + Host: net.JoinHostPort(address, portStr), + BearerToken: token, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + }, + } - client, err := NewKubeletStub(address, port, 10*time.Second, token) + client, err := NewKubeletStub(address, port, "https", 10*time.Second, cfg) if err != nil { t.Fatal(err) } @@ -108,9 +115,11 @@ func TestNewKubeletStub(t *testing.T) { type args struct { addr string port int + scheme string timeout time.Duration - token string + config *rest.Config } + tests := []struct { name string args args @@ -121,14 +130,20 @@ func TestNewKubeletStub(t *testing.T) { args: args{ addr: "127.0.0.1", port: 10250, + scheme: "https", timeout: 10 * time.Second, - token: "test_token", + config: &rest.Config{ + BearerToken: token, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + }, + }, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := NewKubeletStub(tt.args.addr, tt.args.port, tt.args.timeout, tt.args.token) + got, err := NewKubeletStub(tt.args.addr, tt.args.port, tt.args.scheme, tt.args.timeout, tt.args.config) if (err != nil) != tt.wantErr { t.Errorf("NewKubeletStub() error = %v, wantErr %v", err, 
tt.wantErr) return @@ -137,37 +152,3 @@ func TestNewKubeletStub(t *testing.T) { }) } } - -func Test_makeTransportConfig(t *testing.T) { - inToken := "test_token" - ts := &transport.Config{ - BearerToken: inToken, - TLS: transport.TLSConfig{ - Insecure: true, - }, - } - type args struct { - token string - insecure bool - } - tests := []struct { - name string - args args - want *transport.Config - }{ - { - name: "transport", - args: args{ - token: inToken, - }, - want: ts, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := makeTransportConfig(tt.args.token, tt.args.insecure); !reflect.DeepEqual(got, tt.want) { - t.Errorf("makeTransportConfig() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/pkg/koordlet/statesinformer/mockstatesinformer/mock.go b/pkg/koordlet/statesinformer/mockstatesinformer/mock.go index f58eacd7f..437388366 100644 --- a/pkg/koordlet/statesinformer/mockstatesinformer/mock.go +++ b/pkg/koordlet/statesinformer/mockstatesinformer/mock.go @@ -109,7 +109,7 @@ func (mr *MockStatesInformerMockRecorder) HasSynced() *gomock.Call { } // RegisterCallbacks mocks base method. -func (m *MockStatesInformer) RegisterCallbacks(objType reflect.Type, name, description string, callbackFn statesinformer.UpdateCbFn) { +func (m *MockStatesInformer) RegisterCallbacks(objType statesinformer.RegisterType, name, description string, callbackFn statesinformer.UpdateCbFn) { m.ctrl.T.Helper() m.ctrl.Call(m, "RegisterCallbacks", objType, name, description, callbackFn) } diff --git a/pkg/koordlet/statesinformer/register.go b/pkg/koordlet/statesinformer/register.go index 7cc921f4d..4b520eaf4 100644 --- a/pkg/koordlet/statesinformer/register.go +++ b/pkg/koordlet/statesinformer/register.go @@ -17,34 +17,47 @@ limitations under the License. 
package statesinformer import ( - "reflect" - "k8s.io/klog/v2" - - slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" ) -var ( - RegisterTypeNodeSLO = reflect.TypeOf(&slov1alpha1.NodeSLO{}) - RegisterTypePod = reflect.TypeOf(&PodMeta{}) +type RegisterType int64 + +const ( + RegisterTypeNodeSLOSpec RegisterType = iota + RegisterTypeAllPods + RegisterTypeNodeTopology ) +func (r RegisterType) String() string { + switch r { + case RegisterTypeNodeSLOSpec: + return "RegisterTypeNodeSLOSpec" + case RegisterTypeAllPods: + return "RegisterTypeAllPods" + case RegisterTypeNodeTopology: + return "RegisterTypeNodeTopology" + default: + return "RegisterTypeUnknown" + } +} + type updateCallback struct { name string description string fn UpdateCbFn } -type UpdateCbFn func(s StatesInformer) +type UpdateCbCtx struct{} +type UpdateCbFn func(t RegisterType, obj interface{}, pods []*PodMeta) -func (s *statesInformer) RegisterCallbacks(objType reflect.Type, name, description string, callbackFn UpdateCbFn) { - callbacks, legal := s.stateUpdateCallbacks[objType] +func (s *statesInformer) RegisterCallbacks(rType RegisterType, name, description string, callbackFn UpdateCbFn) { + callbacks, legal := s.stateUpdateCallbacks[rType] if !legal { - klog.Fatalf("states informer callback register with type %v is illegal", objType) + klog.Fatalf("states informer callback register with type %v is illegal", rType.String()) } for _, c := range callbacks { if c.name == name { - klog.Fatalf("states informer callback register %s with type %v already registered", name, objType) + klog.Fatalf("states informer callback register %s with type %v already registered", name, rType.String()) } } newCb := updateCallback{ @@ -52,11 +65,11 @@ func (s *statesInformer) RegisterCallbacks(objType reflect.Type, name, descripti description: description, fn: callbackFn, } - s.stateUpdateCallbacks[objType] = append(s.stateUpdateCallbacks[objType], newCb) - klog.Infof("states informer callback %s has registered", name) + s.stateUpdateCallbacks[rType] = append(s.stateUpdateCallbacks[rType], newCb) + klog.V(1).Infof("states informer callback %s has registered for type %v", name, rType.String()) } -func (s *statesInformer) sendCallbacks(objType reflect.Type) { +func (s *statesInformer) sendCallbacks(objType RegisterType) { if _, exist := s.callbackChans[objType]; exist { select { case s.callbackChans[objType] <- struct{}{}: @@ -65,19 +78,20 @@ func (s *statesInformer) sendCallbacks(objType reflect.Type) { klog.Infof("last callback runner %v has not finished, ignore this time", objType.String()) } } else { - klog.Warningf("callback runner %v is not exist", objType.Name()) + klog.Warningf("callback runner %v is not exist", objType.String()) } } -func (s *statesInformer) runCallbacks(objType reflect.Type, obj interface{}) { +func (s *statesInformer) runCallbacks(objType RegisterType, obj interface{}) { callbacks, exist := s.stateUpdateCallbacks[objType] if !exist { klog.Errorf("states informer callbacks type %v not exist", objType.String()) return } + pods := s.GetAllPods() for _, c := range callbacks { klog.V(5).Infof("start running callback function %v for type %v", c.name, objType.String()) - c.fn(s) + c.fn(objType, obj, pods) } } @@ -87,8 +101,8 @@ func (s *statesInformer) startCallbackRunners(stopCh <-chan struct{}) { go func() { for { select { - case <-s.callbackChans[cbType]: - cbObj := s.getObjByType(cbType) + case cbCtx := <-s.callbackChans[cbType]: + cbObj := s.getObjByType(cbType, cbCtx) if cbObj == nil { 
klog.Warningf("callback runner with type %v is not exist") } else { @@ -103,10 +117,18 @@ func (s *statesInformer) startCallbackRunners(stopCh <-chan struct{}) { } } -func (s *statesInformer) getObjByType(objType reflect.Type) interface{} { +func (s *statesInformer) getObjByType(objType RegisterType, cbCtx UpdateCbCtx) interface{} { switch objType { - case reflect.TypeOf(&slov1alpha1.NodeSLO{}): - return s.GetNodeSLO() + case RegisterTypeNodeSLOSpec: + nodeSLO := s.GetNodeSLO() + if nodeSLO != nil { + return &nodeSLO.Spec + } + return nil + case RegisterTypeAllPods: + return &struct{}{} + case RegisterTypeNodeTopology: + return s.getNodeTopo() } return nil } diff --git a/pkg/koordlet/statesinformer/register_test.go b/pkg/koordlet/statesinformer/register_test.go index da4261db2..a3920c3e6 100644 --- a/pkg/koordlet/statesinformer/register_test.go +++ b/pkg/koordlet/statesinformer/register_test.go @@ -17,7 +17,6 @@ limitations under the License. package statesinformer import ( - "reflect" "testing" "github.com/stretchr/testify/assert" @@ -29,7 +28,7 @@ import ( func TestRegisterCallbacksAndRun(t *testing.T) { type args struct { - objType reflect.Type + objType RegisterType name string description string } @@ -38,9 +37,25 @@ func TestRegisterCallbacksAndRun(t *testing.T) { args args }{ { - name: "register and run", + name: "register RegisterTypeNodeSLOSpec and run", args: args{ - objType: reflect.TypeOf(&slov1alpha1.NodeSLO{}), + objType: RegisterTypeNodeSLOSpec, + name: "set-bool-var", + description: "set test bool var as true", + }, + }, + { + name: "register RegisterTypeAllPods and run", + args: args{ + objType: RegisterTypeAllPods, + name: "set-bool-var", + description: "set test bool var as true", + }, + }, + { + name: "register RegisterTypeNodeSLOSpec and run", + args: args{ + objType: RegisterTypeNodeSLOSpec, name: "set-bool-var", description: "set test bool var as true", }, @@ -49,15 +64,18 @@ func TestRegisterCallbacksAndRun(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { testVar := pointer.BoolPtr(false) - callbackFn := func(si StatesInformer) { + callbackFn := func(t RegisterType, obj interface{}, pods []*PodMeta) { *testVar = true } si := &statesInformer{ - stateUpdateCallbacks: map[reflect.Type][]updateCallback{ - reflect.TypeOf(&slov1alpha1.NodeSLO{}): {}, + stateUpdateCallbacks: map[RegisterType][]updateCallback{ + RegisterTypeNodeSLOSpec: {}, + RegisterTypeAllPods: {}, + RegisterTypeNodeTopology: {}, }, } si.RegisterCallbacks(tt.args.objType, tt.args.name, tt.args.description, callbackFn) + si.getObjByType(tt.args.objType, UpdateCbCtx{}) si.runCallbacks(tt.args.objType, &slov1alpha1.NodeSLO{}) assert.Equal(t, *testVar, true) }) @@ -66,9 +84,16 @@ func TestRegisterCallbacksAndRun(t *testing.T) { func Test_statesInformer_startCallbackRunners(t *testing.T) { output := make(chan string, 1) + nodeSLO := &slov1alpha1.NodeSLO{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "test-label-key": "test-label-val1", + }, + }, + } stopCh := make(chan struct{}, 1) type args struct { - objType reflect.Type + objType RegisterType nodeSLO *slov1alpha1.NodeSLO name string description string @@ -82,18 +107,11 @@ func Test_statesInformer_startCallbackRunners(t *testing.T) { { name: "callback get nodeslo label", args: args{ - objType: reflect.TypeOf(&slov1alpha1.NodeSLO{}), - nodeSLO: &slov1alpha1.NodeSLO{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - "test-label-key": "test-label-val1", - }, - }, - }, + objType: 
RegisterTypeNodeSLOSpec, + nodeSLO: nodeSLO, name: "get value from node slo label", description: "get value from node slo label", - fn: func(s StatesInformer) { - nodeSLO := s.GetNodeSLO() + fn: func(t RegisterType, obj interface{}, pods []*PodMeta) { output <- nodeSLO.Labels["test-label-key"] stopCh <- struct{}{} }, @@ -105,10 +123,10 @@ func Test_statesInformer_startCallbackRunners(t *testing.T) { t.Run(tt.name, func(t *testing.T) { si := &statesInformer{ nodeSLO: tt.args.nodeSLO, - callbackChans: map[reflect.Type]chan struct{}{ - tt.args.objType: make(chan struct{}, 1), + callbackChans: map[RegisterType]chan UpdateCbCtx{ + tt.args.objType: make(chan UpdateCbCtx, 1), }, - stateUpdateCallbacks: map[reflect.Type][]updateCallback{ + stateUpdateCallbacks: map[RegisterType][]updateCallback{ tt.args.objType: {}, }, } diff --git a/pkg/koordlet/statesinformer/states_informer.go b/pkg/koordlet/statesinformer/states_informer.go index 02518d9de..233c0c78f 100644 --- a/pkg/koordlet/statesinformer/states_informer.go +++ b/pkg/koordlet/statesinformer/states_informer.go @@ -19,11 +19,10 @@ package statesinformer import ( "context" "fmt" - "io/ioutil" - "reflect" "sync" "time" + topov1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1" topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned" _ "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned/scheme" "go.uber.org/atomic" @@ -34,8 +33,10 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client/config" slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" koordclientset "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned" @@ -45,7 +46,8 @@ import ( ) const ( - tokenPath = "/var/run/secrets/kubernetes.io/serviceaccount/token" + HTTPScheme = "http" + HTTPSScheme = "https" ) type StatesInformer interface { @@ -57,7 +59,7 @@ type StatesInformer interface { GetAllPods() []*PodMeta - RegisterCallbacks(objType reflect.Type, name, description string, callbackFn UpdateCbFn) + RegisterCallbacks(objType RegisterType, name, description string, callbackFn UpdateCbFn) } type statesInformer struct { @@ -76,6 +78,8 @@ type statesInformer struct { nodeSLORWMutex sync.RWMutex nodeSLO *slov1alpha1.NodeSLO + nodeTopoMutex sync.RWMutex + nodeTopology *topov1alpha1.NodeResourceTopology topologyClient topologyclientset.Interface podRWMutex sync.RWMutex @@ -83,8 +87,8 @@ type statesInformer struct { podUpdatedTime time.Time metricsCache metriccache.MetricCache - callbackChans map[reflect.Type]chan struct{} - stateUpdateCallbacks map[reflect.Type][]updateCallback + callbackChans map[RegisterType]chan UpdateCbCtx + stateUpdateCallbacks map[RegisterType][]updateCallback } func NewStatesInformer(config *Config, kubeClient clientset.Interface, crdClient koordclientset.Interface, topologyClient *topologyclientset.Clientset, metricsCache metriccache.MetricCache, pleg pleg.Pleg, nodeName string) StatesInformer { @@ -103,11 +107,15 @@ func NewStatesInformer(config *Config, kubeClient clientset.Interface, crdClient podMap: map[string]*PodMeta{}, podCreated: make(chan string, 1), // set 1 buffer - callbackChans: map[reflect.Type]chan struct{}{ - reflect.TypeOf(&slov1alpha1.NodeSLO{}): make(chan struct{}, 1), + callbackChans: 
map[RegisterType]chan UpdateCbCtx{ + RegisterTypeNodeSLOSpec: make(chan UpdateCbCtx, 1), + RegisterTypeAllPods: make(chan UpdateCbCtx, 1), + RegisterTypeNodeTopology: make(chan UpdateCbCtx, 1), }, - stateUpdateCallbacks: map[reflect.Type][]updateCallback{ - reflect.TypeOf(&slov1alpha1.NodeSLO{}): {}, + stateUpdateCallbacks: map[RegisterType][]updateCallback{ + RegisterTypeNodeSLOSpec: {}, + RegisterTypeAllPods: {}, + RegisterTypeNodeTopology: {}, }, topologyClient: topologyClient, metricsCache: metricsCache, @@ -129,7 +137,7 @@ func (s *statesInformer) Run(stopCh <-chan struct{}) error { return fmt.Errorf("timed out waiting for states informer caches to sync") } - stub, err := newKubeletStub(s.GetNode(), s.config.KubeletPreferredAddressType, s.config.KubeletSyncTimeout, tokenPath) + stub, err := newKubeletStubFromConfig(s.GetNode(), s.config) if err != nil { klog.ErrorS(err, "create kubelet stub") return err @@ -240,10 +248,14 @@ func (s *statesInformer) setupInformers() { s.setupNodeSLOInformer() } -func newKubeletStub(node *corev1.Node, addressPreferred string, timeout time.Duration, tokenPath string) (KubeletStub, error) { +func newKubeletStubFromConfig(node *corev1.Node, cfg *Config) (KubeletStub, error) { var address string var err error - addressPreferredType := corev1.NodeAddressType(addressPreferred) + var port int + var scheme string + var restConfig *rest.Config + + addressPreferredType := corev1.NodeAddressType(cfg.KubeletPreferredAddressType) // if the address of the specified type has not been set or error type, InternalIP will be used. if !util.IsNodeAddressTypeSupported(addressPreferredType) { klog.Warningf("Wrong address type or empty type, InternalIP will be used, error: (%+v).", addressPreferredType) @@ -251,12 +263,24 @@ func newKubeletStub(node *corev1.Node, addressPreferred string, timeout time.Dur } address, err = util.GetNodeAddress(node, addressPreferredType) if err != nil { - klog.Fatalf("Get node address error: %v type(%s) ", err, addressPreferred) - } - token, err := ioutil.ReadFile(tokenPath) - if err != nil { + klog.Fatalf("Get node address error: %v type(%s) ", err, cfg.KubeletPreferredAddressType) return nil, err } - kubeletEndpointPort := node.Status.DaemonEndpoints.KubeletEndpoint.Port - return NewKubeletStub(address, int(kubeletEndpointPort), timeout, string(token)) + + if cfg.InsecureKubeletTLS { + port = int(cfg.KubeletReadOnlyPort) + scheme = HTTPScheme + } else { + restConfig, err = config.GetConfig() + if err != nil { + return nil, err + } + restConfig.TLSClientConfig.Insecure = true + restConfig.TLSClientConfig.CAData = nil + restConfig.TLSClientConfig.CAFile = "" + port = int(node.Status.DaemonEndpoints.KubeletEndpoint.Port) + scheme = HTTPSScheme + } + + return NewKubeletStub(address, port, scheme, cfg.KubeletSyncTimeout, restConfig) } diff --git a/pkg/koordlet/statesinformer/states_informer_test.go b/pkg/koordlet/statesinformer/states_informer_test.go index 782e3e82a..c345749ef 100644 --- a/pkg/koordlet/statesinformer/states_informer_test.go +++ b/pkg/koordlet/statesinformer/states_informer_test.go @@ -18,15 +18,18 @@ package statesinformer import ( "errors" - "io/ioutil" + "net" "os" "path/filepath" + "strings" "testing" "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientsetfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" koordclientfake "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned/fake" 
"github.com/koordinator-sh/koordinator/pkg/koordlet/metrics" @@ -226,19 +229,21 @@ func Test_newKubeletStub(t *testing.T) { }, }, } - tokenContent := "test_token" - f, err := ioutil.TempFile("", "token") - if err != nil { - t.Fatal(err) + + dir := t.TempDir() + cfg := &rest.Config{ + Host: net.JoinHostPort("127.0.0.1", "10250"), + BearerToken: token, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + }, } - defer os.Remove(f.Name()) - f.WriteString(tokenContent) - kubeStub, _ := NewKubeletStub("127.0.0.7", 10250, 10, tokenContent) + setConfigs(t, dir) + + kubeStub, _ := NewKubeletStub("127.0.0.1", 10250, "https", 10, cfg) type args struct { - node *corev1.Node - addressPreferred string - timeout time.Duration - tokenPath string + node *corev1.Node + cfg *Config } tests := []struct { name string @@ -249,10 +254,13 @@ func Test_newKubeletStub(t *testing.T) { { name: "NodeInternalIP", args: args{ - node: testingNode, - addressPreferred: string(corev1.NodeInternalIP), - timeout: 10 * time.Second, - tokenPath: f.Name(), + node: testingNode, + cfg: &Config{ + KubeletPreferredAddressType: string(corev1.NodeInternalIP), + KubeletSyncTimeout: 10 * time.Second, + InsecureKubeletTLS: true, + KubeletReadOnlyPort: 10250, + }, }, want: kubeStub, wantErr: false, @@ -260,29 +268,35 @@ func Test_newKubeletStub(t *testing.T) { { name: "Empty IP", args: args{ - node: testingNode, - addressPreferred: "", - timeout: 10 * time.Second, - tokenPath: f.Name(), + node: testingNode, + cfg: &Config{ + KubeletPreferredAddressType: "", + KubeletSyncTimeout: 10 * time.Second, + InsecureKubeletTLS: true, + KubeletReadOnlyPort: 10250, + }, }, want: kubeStub, wantErr: false, }, { - name: "Error Path", + name: "HTTPS", args: args{ - node: testingNode, - addressPreferred: "", - timeout: 10 * time.Second, - tokenPath: "", + node: testingNode, + cfg: &Config{ + KubeletPreferredAddressType: "", + KubeletSyncTimeout: 10 * time.Second, + InsecureKubeletTLS: false, + KubeletReadOnlyPort: 10250, + }, }, want: nil, - wantErr: true, + wantErr: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := newKubeletStub(tt.args.node, tt.args.addressPreferred, tt.args.timeout, tt.args.tokenPath) + got, err := newKubeletStubFromConfig(tt.args.node, tt.args.cfg) if (err != nil) != tt.wantErr { t.Errorf("newKubeletStub() error = %v, wantErr %v", err, tt.wantErr) return @@ -294,6 +308,35 @@ func Test_newKubeletStub(t *testing.T) { } } +func setConfigs(t *testing.T, dir string) { + // Set KUBECONFIG env value + kubeconfigEnvPath := filepath.Join(dir, "kubeconfig-text-context") + os.WriteFile(kubeconfigEnvPath, []byte(genKubeconfig("from-env")), 0644) + t.Setenv(clientcmd.RecommendedConfigPathEnvVar, kubeconfigEnvPath) +} + +func genKubeconfig(contexts ...string) string { + var sb strings.Builder + sb.WriteString("---\napiVersion: v1\nkind: Config\nclusters:\n") + for _, ctx := range contexts { + sb.WriteString("- cluster:\n server: " + ctx + "\n name: " + ctx + "\n") + } + sb.WriteString("contexts:\n") + for _, ctx := range contexts { + sb.WriteString("- context:\n cluster: " + ctx + "\n user: " + ctx + "\n name: " + ctx + "\n") + } + + sb.WriteString("users:\n") + for _, ctx := range contexts { + sb.WriteString("- name: " + ctx + "\n") + } + sb.WriteString("preferences: {}\n") + if len(contexts) > 0 { + sb.WriteString("current-context: " + contexts[0] + "\n") + } + return sb.String() +} + func Test_statesInformer_syncKubeletLoop(t *testing.T) { client := clientsetfake.NewSimpleClientset() crdClient := 
koordclientfake.NewSimpleClientset() diff --git a/pkg/koordlet/statesinformer/states_noderesourcetopology.go b/pkg/koordlet/statesinformer/states_noderesourcetopology.go index f6105e205..a310b57aa 100644 --- a/pkg/koordlet/statesinformer/states_noderesourcetopology.go +++ b/pkg/koordlet/statesinformer/states_noderesourcetopology.go @@ -106,6 +106,8 @@ func (s *statesInformer) reportNodeTopology() { if topology.Annotations == nil { topology.Annotations = make(map[string]string) } + // TODO only update if necessary + s.updateNodeTopo(topology) if topology.Annotations[extension.AnnotationNodeCPUTopology] == string(cpuTopologyJson) && topology.Annotations[extension.AnnotationNodeCPUSharedPools] == string(cpuSharePoolsJson) { return nil } @@ -189,3 +191,21 @@ func (s *statesInformer) calCpuTopology() (*extension.CPUTopology, map[int32]*ex } return cpuTopology, usedCPUs, nil } + +func (s *statesInformer) updateNodeTopo(newTopo *v1alpha1.NodeResourceTopology) { + s.setNodeTopo(newTopo) + klog.V(5).Infof("local node topology info updated %v", newTopo) + s.sendCallbacks(RegisterTypeNodeTopology) +} + +func (s *statesInformer) setNodeTopo(newTopo *v1alpha1.NodeResourceTopology) { + s.nodeTopoMutex.Lock() + defer s.nodeTopoMutex.Unlock() + s.nodeTopology = newTopo +} + +func (s *statesInformer) getNodeTopo() *v1alpha1.NodeResourceTopology { + s.nodeTopoMutex.RLock() + defer s.nodeTopoMutex.RUnlock() + return s.nodeTopology.DeepCopy() +} diff --git a/pkg/koordlet/statesinformer/states_nodeslo.go b/pkg/koordlet/statesinformer/states_nodeslo.go index 0ff071808..d0db20166 100644 --- a/pkg/koordlet/statesinformer/states_nodeslo.go +++ b/pkg/koordlet/statesinformer/states_nodeslo.go @@ -63,7 +63,7 @@ func (s *statesInformer) setupNodeSLOInformer() { func (s *statesInformer) updateNodeSLOSpec(nodeSLO *slov1alpha1.NodeSLO) { s.setNodeSLOSpec(nodeSLO) - s.sendCallbacks(RegisterTypeNodeSLO) + s.sendCallbacks(RegisterTypeNodeSLOSpec) } func (s *statesInformer) setNodeSLOSpec(nodeSLO *slov1alpha1.NodeSLO) { @@ -98,12 +98,12 @@ func (s *statesInformer) mergeNodeSLOSpec(nodeSLO *slov1alpha1.NodeSLO) { s.nodeSLO.Spec.ResourceUsedThresholdWithBE = mergedResourceUsedThresholdWithBESpec } - // merge ResourceQoSStrategy - mergedResourceQoSStrategySpec := mergeSLOSpecResourceQoSStrategy(util.DefaultNodeSLOSpecConfig().ResourceQoSStrategy, - nodeSLO.Spec.ResourceQoSStrategy) - mergeNoneResourceQoSIfDisabled(mergedResourceQoSStrategySpec) - if mergedResourceQoSStrategySpec != nil { - s.nodeSLO.Spec.ResourceQoSStrategy = mergedResourceQoSStrategySpec + // merge ResourceQOSStrategy + mergedResourceQOSStrategySpec := mergeSLOSpecResourceQOSStrategy(util.DefaultNodeSLOSpecConfig().ResourceQOSStrategy, + nodeSLO.Spec.ResourceQOSStrategy) + mergeNoneResourceQOSIfDisabled(mergedResourceQOSStrategySpec) + if mergedResourceQOSStrategySpec != nil { + s.nodeSLO.Spec.ResourceQOSStrategy = mergedResourceQOSStrategySpec } // merge CPUBurstStrategy @@ -128,9 +128,9 @@ func mergeSLOSpecResourceUsedThresholdWithBE(defaultSpec, newSpec *slov1alpha1.R return out } -func mergeSLOSpecResourceQoSStrategy(defaultSpec, - newSpec *slov1alpha1.ResourceQoSStrategy) *slov1alpha1.ResourceQoSStrategy { - spec := &slov1alpha1.ResourceQoSStrategy{} +func mergeSLOSpecResourceQOSStrategy(defaultSpec, + newSpec *slov1alpha1.ResourceQOSStrategy) *slov1alpha1.ResourceQOSStrategy { + spec := &slov1alpha1.ResourceQOSStrategy{} if newSpec != nil { spec = newSpec } @@ -156,59 +156,59 @@ func mergeSLOSpecCPUBurstStrategy(defaultSpec, return out } -// 
mergeNoneResourceQoSIfDisabled complete ResourceQoSStrategy according to enable statuses of qos features -func mergeNoneResourceQoSIfDisabled(resourceQoS *slov1alpha1.ResourceQoSStrategy) { - mergeNoneCPUQoSIfDisabled(resourceQoS) - mergeNoneResctrlQoSIfDisabled(resourceQoS) - mergeNoneMemoryQoSIfDisabled(resourceQoS) - klog.V(5).Infof("get merged node ResourceQoS %v", util.DumpJSON(resourceQoS)) +// mergeNoneResourceQOSIfDisabled complete ResourceQOSStrategy according to enable statuses of qos features +func mergeNoneResourceQOSIfDisabled(resourceQOS *slov1alpha1.ResourceQOSStrategy) { + mergeNoneCPUQOSIfDisabled(resourceQOS) + mergeNoneResctrlQOSIfDisabled(resourceQOS) + mergeNoneMemoryQOSIfDisabled(resourceQOS) + klog.V(5).Infof("get merged node ResourceQOS %v", util.DumpJSON(resourceQOS)) } -// mergeNoneResctrlQoSIfDisabled completes node's resctrl qos config according to Enable options in ResctrlQoS -func mergeNoneResctrlQoSIfDisabled(resourceQoS *slov1alpha1.ResourceQoSStrategy) { - if resourceQoS.LSR != nil && resourceQoS.LSR.ResctrlQoS != nil && - resourceQoS.LSR.ResctrlQoS.Enable != nil && !(*resourceQoS.LSR.ResctrlQoS.Enable) { - resourceQoS.LSR.ResctrlQoS.ResctrlQoS = *util.NoneResctrlQoS() +// mergeNoneResctrlQOSIfDisabled completes node's resctrl qos config according to Enable options in ResctrlQOS +func mergeNoneResctrlQOSIfDisabled(resourceQOS *slov1alpha1.ResourceQOSStrategy) { + if resourceQOS.LSRClass != nil && resourceQOS.LSRClass.ResctrlQOS != nil && + resourceQOS.LSRClass.ResctrlQOS.Enable != nil && !(*resourceQOS.LSRClass.ResctrlQOS.Enable) { + resourceQOS.LSRClass.ResctrlQOS.ResctrlQOS = *util.NoneResctrlQOS() } - if resourceQoS.LS != nil && resourceQoS.LS.ResctrlQoS != nil && - resourceQoS.LS.ResctrlQoS.Enable != nil && !(*resourceQoS.LS.ResctrlQoS.Enable) { - resourceQoS.LS.ResctrlQoS.ResctrlQoS = *util.NoneResctrlQoS() + if resourceQOS.LSClass != nil && resourceQOS.LSClass.ResctrlQOS != nil && + resourceQOS.LSClass.ResctrlQOS.Enable != nil && !(*resourceQOS.LSClass.ResctrlQOS.Enable) { + resourceQOS.LSClass.ResctrlQOS.ResctrlQOS = *util.NoneResctrlQOS() } - if resourceQoS.BE != nil && resourceQoS.BE.ResctrlQoS != nil && - resourceQoS.BE.ResctrlQoS.Enable != nil && !(*resourceQoS.BE.ResctrlQoS.Enable) { - resourceQoS.BE.ResctrlQoS.ResctrlQoS = *util.NoneResctrlQoS() + if resourceQOS.BEClass != nil && resourceQOS.BEClass.ResctrlQOS != nil && + resourceQOS.BEClass.ResctrlQOS.Enable != nil && !(*resourceQOS.BEClass.ResctrlQOS.Enable) { + resourceQOS.BEClass.ResctrlQOS.ResctrlQOS = *util.NoneResctrlQOS() } } -// mergeNoneMemoryQoSIfDisabled completes node's memory qos config according to Enable options in MemoryQoS -func mergeNoneMemoryQoSIfDisabled(resourceQoS *slov1alpha1.ResourceQoSStrategy) { - // if MemoryQoS.Enable=false, merge with NoneMemoryQoS - if resourceQoS.LSR != nil && resourceQoS.LSR.MemoryQoS != nil && - resourceQoS.LSR.MemoryQoS.Enable != nil && !(*resourceQoS.LSR.MemoryQoS.Enable) { - resourceQoS.LSR.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() +// mergeNoneMemoryQOSIfDisabled completes node's memory qos config according to Enable options in MemoryQOS +func mergeNoneMemoryQOSIfDisabled(resourceQOS *slov1alpha1.ResourceQOSStrategy) { + // if MemoryQOS.Enable=false, merge with NoneMemoryQOS + if resourceQOS.LSRClass != nil && resourceQOS.LSRClass.MemoryQOS != nil && + resourceQOS.LSRClass.MemoryQOS.Enable != nil && !(*resourceQOS.LSRClass.MemoryQOS.Enable) { + resourceQOS.LSRClass.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() } - if resourceQoS.LS != nil 
&& resourceQoS.LS.MemoryQoS != nil && - resourceQoS.LS.MemoryQoS.Enable != nil && !(*resourceQoS.LS.MemoryQoS.Enable) { - resourceQoS.LS.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() + if resourceQOS.LSClass != nil && resourceQOS.LSClass.MemoryQOS != nil && + resourceQOS.LSClass.MemoryQOS.Enable != nil && !(*resourceQOS.LSClass.MemoryQOS.Enable) { + resourceQOS.LSClass.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() } - if resourceQoS.BE != nil && resourceQoS.BE.MemoryQoS != nil && - resourceQoS.BE.MemoryQoS.Enable != nil && !(*resourceQoS.BE.MemoryQoS.Enable) { - resourceQoS.BE.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() + if resourceQOS.BEClass != nil && resourceQOS.BEClass.MemoryQOS != nil && + resourceQOS.BEClass.MemoryQOS.Enable != nil && !(*resourceQOS.BEClass.MemoryQOS.Enable) { + resourceQOS.BEClass.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() } } -func mergeNoneCPUQoSIfDisabled(resourceQoS *slov1alpha1.ResourceQoSStrategy) { - // if CPUQoS.Enabled=false, merge with NoneCPUQoS - if resourceQoS.LSR != nil && resourceQoS.LSR.CPUQoS != nil && - resourceQoS.LSR.CPUQoS.Enable != nil && !(*resourceQoS.LSR.CPUQoS.Enable) { - resourceQoS.LSR.CPUQoS.CPUQoS = *util.NoneCPUQoS() +func mergeNoneCPUQOSIfDisabled(resourceQOS *slov1alpha1.ResourceQOSStrategy) { + // if CPUQOS.Enabled=false, merge with NoneCPUQOS + if resourceQOS.LSRClass != nil && resourceQOS.LSRClass.CPUQOS != nil && + resourceQOS.LSRClass.CPUQOS.Enable != nil && !(*resourceQOS.LSRClass.CPUQOS.Enable) { + resourceQOS.LSRClass.CPUQOS.CPUQOS = *util.NoneCPUQOS() } - if resourceQoS.LS != nil && resourceQoS.LS.CPUQoS != nil && - resourceQoS.LS.CPUQoS.Enable != nil && !(*resourceQoS.LS.CPUQoS.Enable) { - resourceQoS.LS.CPUQoS.CPUQoS = *util.NoneCPUQoS() + if resourceQOS.LSClass != nil && resourceQOS.LSClass.CPUQOS != nil && + resourceQOS.LSClass.CPUQOS.Enable != nil && !(*resourceQOS.LSClass.CPUQOS.Enable) { + resourceQOS.LSClass.CPUQOS.CPUQOS = *util.NoneCPUQOS() } - if resourceQoS.BE != nil && resourceQoS.BE.CPUQoS != nil && - resourceQoS.BE.CPUQoS.Enable != nil && !(*resourceQoS.BE.CPUQoS.Enable) { - resourceQoS.BE.CPUQoS.CPUQoS = *util.NoneCPUQoS() + if resourceQOS.BEClass != nil && resourceQOS.BEClass.CPUQOS != nil && + resourceQOS.BEClass.CPUQOS.Enable != nil && !(*resourceQOS.BEClass.CPUQOS.Enable) { + resourceQOS.BEClass.CPUQOS.CPUQOS = *util.NoneCPUQOS() } } diff --git a/pkg/koordlet/statesinformer/states_nodeslo_test.go b/pkg/koordlet/statesinformer/states_nodeslo_test.go index 28199b42e..94342da78 100644 --- a/pkg/koordlet/statesinformer/states_nodeslo_test.go +++ b/pkg/koordlet/statesinformer/states_nodeslo_test.go @@ -32,19 +32,19 @@ func Test_mergeNodeSLOSpec(t *testing.T) { ResourceUsedThresholdWithBE: &slov1alpha1.ResourceThresholdStrategy{ CPUSuppressThresholdPercent: pointer.Int64Ptr(80), }, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: util.NoneResourceQoS(apiext.QoSLSR), - LS: util.NoneResourceQoS(apiext.QoSLS), - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: util.NoneResourceQOS(apiext.QoSLSR), + LSClass: util.NoneResourceQOS(apiext.QoSLS), + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.BoolPtr(true), }, - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), }, - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(true), - ResctrlQoS: slov1alpha1.ResctrlQoS{ + 
ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeEndPercent: pointer.Int64Ptr(50), }, }, @@ -111,26 +111,26 @@ func Test_mergeNodeSLOSpec(t *testing.T) { CPUSuppressThresholdPercent: pointer.Int64Ptr(100), MemoryEvictThresholdPercent: pointer.Int64Ptr(100), }, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, }, }, - LS: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + LSClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), }, }, }, - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ - ResctrlQoS: slov1alpha1.ResctrlQoS{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(40), }, @@ -163,10 +163,10 @@ func Test_createNodeSLO(t *testing.T) { CPUSuppressThresholdPercent: pointer.Int64Ptr(80), } - testingNewNodeSLO.Spec.ResourceQoSStrategy.BE = &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + testingNewNodeSLO.Spec.ResourceQOSStrategy.BEClass = &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(true), - ResctrlQoS: slov1alpha1.ResctrlQoS{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(20), }, @@ -178,12 +178,12 @@ func Test_createNodeSLO(t *testing.T) { } testingCreatedNodeSLO.Spec.ResourceUsedThresholdWithBE.Enable = pointer.BoolPtr(true) testingCreatedNodeSLO.Spec.ResourceUsedThresholdWithBE.CPUSuppressThresholdPercent = pointer.Int64Ptr(80) - testingCreatedNodeSLO.Spec.ResourceQoSStrategy.LSR = util.NoneResourceQoS(apiext.QoSLSR) - testingCreatedNodeSLO.Spec.ResourceQoSStrategy.LS = util.NoneResourceQoS(apiext.QoSLS) - testingCreatedNodeSLO.Spec.ResourceQoSStrategy.BE = util.NoneResourceQoS(apiext.QoSBE) - testingCreatedNodeSLO.Spec.ResourceQoSStrategy.BE.ResctrlQoS.Enable = pointer.BoolPtr(true) - testingCreatedNodeSLO.Spec.ResourceQoSStrategy.BE.ResctrlQoS.CATRangeStartPercent = pointer.Int64Ptr(0) - testingCreatedNodeSLO.Spec.ResourceQoSStrategy.BE.ResctrlQoS.CATRangeEndPercent = pointer.Int64Ptr(20) + testingCreatedNodeSLO.Spec.ResourceQOSStrategy.LSRClass = util.NoneResourceQOS(apiext.QoSLSR) + testingCreatedNodeSLO.Spec.ResourceQOSStrategy.LSClass = util.NoneResourceQOS(apiext.QoSLS) + testingCreatedNodeSLO.Spec.ResourceQOSStrategy.BEClass = util.NoneResourceQOS(apiext.QoSBE) + testingCreatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.ResctrlQOS.Enable = pointer.BoolPtr(true) + testingCreatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.ResctrlQOS.CATRangeStartPercent = pointer.Int64Ptr(0) + testingCreatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.ResctrlQOS.CATRangeEndPercent = pointer.Int64Ptr(20) r := statesInformer{nodeSLO: nil} @@ -198,11 +198,11 @@ func Test_updateNodeSLOSpec(t *testing.T) { Enable: pointer.BoolPtr(true), CPUSuppressThresholdPercent: pointer.Int64Ptr(80), }, - ResourceQoSStrategy: 
&slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(true), - ResctrlQoS: slov1alpha1.ResctrlQoS{ + ResctrlQOS: slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(20), }, @@ -216,19 +216,19 @@ func Test_updateNodeSLOSpec(t *testing.T) { } testingUpdatedNodeSLO.Spec.ResourceUsedThresholdWithBE.Enable = pointer.BoolPtr(true) testingUpdatedNodeSLO.Spec.ResourceUsedThresholdWithBE.CPUSuppressThresholdPercent = pointer.Int64Ptr(80) - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.LSR.CPUQoS.CPUQoS = *util.NoneCPUQoS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.LSR.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.LSR.ResctrlQoS.ResctrlQoS = *util.NoneResctrlQoS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.LSRClass.CPUQOS.CPUQOS = *util.NoneCPUQOS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.LSRClass.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.LSRClass.ResctrlQOS.ResctrlQOS = *util.NoneResctrlQOS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.LS.CPUQoS.CPUQoS = *util.NoneCPUQoS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.LS.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.LS.ResctrlQoS.ResctrlQoS = *util.NoneResctrlQoS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.LSClass.CPUQOS.CPUQOS = *util.NoneCPUQOS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.LSClass.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.LSClass.ResctrlQOS.ResctrlQOS = *util.NoneResctrlQOS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.BE.CPUQoS.CPUQoS = *util.NoneCPUQoS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.BE.MemoryQoS.MemoryQoS = *util.NoneMemoryQoS() - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.BE.ResctrlQoS.Enable = pointer.BoolPtr(true) - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.BE.ResctrlQoS.CATRangeStartPercent = pointer.Int64Ptr(0) - testingUpdatedNodeSLO.Spec.ResourceQoSStrategy.BE.ResctrlQoS.CATRangeEndPercent = pointer.Int64Ptr(20) + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.CPUQOS.CPUQOS = *util.NoneCPUQOS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.MemoryQOS.MemoryQOS = *util.NoneMemoryQOS() + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.ResctrlQOS.Enable = pointer.BoolPtr(true) + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.ResctrlQOS.CATRangeStartPercent = pointer.Int64Ptr(0) + testingUpdatedNodeSLO.Spec.ResourceQOSStrategy.BEClass.ResctrlQOS.CATRangeEndPercent = pointer.Int64Ptr(20) r := statesInformer{ nodeSLO: &slov1alpha1.NodeSLO{ @@ -324,17 +324,17 @@ func Test_mergeSLOSpecResourceUsedThresholdWithBE(t *testing.T) { } } -func Test_mergeSLOSpecResourceQoSStrategy(t *testing.T) { - testingDefaultSpec := util.DefaultResourceQoSStrategy() +func Test_mergeSLOSpecResourceQOSStrategy(t *testing.T) { + testingDefaultSpec := util.DefaultResourceQOSStrategy() testingNewSpec := testingDefaultSpec.DeepCopy() - testingNewSpec.BE.MemoryQoS.WmarkRatio = pointer.Int64Ptr(0) + testingNewSpec.BEClass.MemoryQOS.WmarkRatio = pointer.Int64Ptr(0) - testingNewSpec1 := &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + 
testingNewSpec1 := &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(true), - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: slov1alpha1.MemoryQOS{ WmarkRatio: pointer.Int64Ptr(90), }, }, @@ -342,30 +342,30 @@ func Test_mergeSLOSpecResourceQoSStrategy(t *testing.T) { } testingMergedSpec := testingDefaultSpec.DeepCopy() - testingMergedSpec.BE.MemoryQoS.Enable = pointer.BoolPtr(true) - testingMergedSpec.BE.MemoryQoS.WmarkRatio = pointer.Int64Ptr(90) + testingMergedSpec.BEClass.MemoryQOS.Enable = pointer.BoolPtr(true) + testingMergedSpec.BEClass.MemoryQOS.WmarkRatio = pointer.Int64Ptr(90) type args struct { - defaultSpec *slov1alpha1.ResourceQoSStrategy - newSpec *slov1alpha1.ResourceQoSStrategy + defaultSpec *slov1alpha1.ResourceQOSStrategy + newSpec *slov1alpha1.ResourceQOSStrategy } tests := []struct { name string args args - want *slov1alpha1.ResourceQoSStrategy + want *slov1alpha1.ResourceQOSStrategy }{ { name: "both empty", args: args{ - defaultSpec: &slov1alpha1.ResourceQoSStrategy{}, - newSpec: &slov1alpha1.ResourceQoSStrategy{}, + defaultSpec: &slov1alpha1.ResourceQOSStrategy{}, + newSpec: &slov1alpha1.ResourceQOSStrategy{}, }, - want: &slov1alpha1.ResourceQoSStrategy{}, + want: &slov1alpha1.ResourceQOSStrategy{}, }, { name: "totally use new", args: args{ - defaultSpec: &slov1alpha1.ResourceQoSStrategy{}, + defaultSpec: &slov1alpha1.ResourceQOSStrategy{}, newSpec: testingNewSpec, }, want: testingNewSpec, @@ -396,21 +396,21 @@ func Test_mergeSLOSpecResourceQoSStrategy(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := mergeSLOSpecResourceQoSStrategy(tt.args.defaultSpec, tt.args.newSpec) + got := mergeSLOSpecResourceQOSStrategy(tt.args.defaultSpec, tt.args.newSpec) assert.Equal(t, tt.want, got) }) } } -func Test_mergeNoneResourceQoSIfDisabled(t *testing.T) { - testDefault := util.DefaultResourceQoSStrategy() - testAllNone := util.NoneResourceQoSStrategy() +func Test_mergeNoneResourceQOSIfDisabled(t *testing.T) { + testDefault := util.DefaultResourceQOSStrategy() + testAllNone := util.NoneResourceQOSStrategy() testLSMemQOSEnabled := testDefault.DeepCopy() - testLSMemQOSEnabled.LS.MemoryQoS.Enable = pointer.BoolPtr(true) - testLSMemQOSEnabledResult := util.NoneResourceQoSStrategy() - testLSMemQOSEnabledResult.LS.MemoryQoS.Enable = pointer.BoolPtr(true) - testLSMemQOSEnabledResult.LS.MemoryQoS.MemoryQoS = *util.DefaultMemoryQoS(apiext.QoSLS) + testLSMemQOSEnabled.LSClass.MemoryQOS.Enable = pointer.BoolPtr(true) + testLSMemQOSEnabledResult := util.NoneResourceQOSStrategy() + testLSMemQOSEnabledResult.LSClass.MemoryQOS.Enable = pointer.BoolPtr(true) + testLSMemQOSEnabledResult.LSClass.MemoryQOS.MemoryQOS = *util.DefaultMemoryQOS(apiext.QoSLS) type args struct { nodeCfg *slov1alpha1.NodeSLO @@ -418,14 +418,14 @@ func Test_mergeNoneResourceQoSIfDisabled(t *testing.T) { tests := []struct { name string args args - want *slov1alpha1.ResourceQoSStrategy + want *slov1alpha1.ResourceQOSStrategy }{ { name: "all disabled", args: args{ nodeCfg: &slov1alpha1.NodeSLO{ Spec: slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: testDefault, + ResourceQOSStrategy: testDefault, }, }, }, @@ -436,7 +436,7 @@ func Test_mergeNoneResourceQoSIfDisabled(t *testing.T) { args: args{ nodeCfg: &slov1alpha1.NodeSLO{ Spec: slov1alpha1.NodeSLOSpec{ - ResourceQoSStrategy: testLSMemQOSEnabled, + ResourceQOSStrategy: testLSMemQOSEnabled, }, }, }, @@ -445,8 +445,8 @@ func 
Test_mergeNoneResourceQoSIfDisabled(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - mergeNoneResourceQoSIfDisabled(tt.args.nodeCfg.Spec.ResourceQoSStrategy) - assert.Equal(t, tt.want, tt.args.nodeCfg.Spec.ResourceQoSStrategy) + mergeNoneResourceQOSIfDisabled(tt.args.nodeCfg.Spec.ResourceQOSStrategy) + assert.Equal(t, tt.want, tt.args.nodeCfg.Spec.ResourceQOSStrategy) }) } } diff --git a/pkg/koordlet/statesinformer/states_pod.go b/pkg/koordlet/statesinformer/states_pod.go index 920c6aafd..4fa34e20b 100644 --- a/pkg/koordlet/statesinformer/states_pod.go +++ b/pkg/koordlet/statesinformer/states_pod.go @@ -45,7 +45,7 @@ func (s *statesInformer) syncPods() error { s.podHasSynced.Store(true) s.podUpdatedTime = time.Now() klog.Infof("get pods success, len %d", len(s.podMap)) - s.sendCallbacks(RegisterTypePod) + s.sendCallbacks(RegisterTypeAllPods) return nil } diff --git a/pkg/runtimeproxy/dispatcher/hookclient.go b/pkg/runtimeproxy/dispatcher/hookclient.go index c124cef42..fd63eada7 100644 --- a/pkg/runtimeproxy/dispatcher/hookclient.go +++ b/pkg/runtimeproxy/dispatcher/hookclient.go @@ -17,7 +17,6 @@ limitations under the License. package dispatcher import ( - "encoding/json" "fmt" "github.com/golang/groupcache/lru" @@ -69,11 +68,7 @@ func newRuntimeHookClient(sockPath string) (*RuntimeHookClient, error) { } func (cm *HookServerClientManager) RuntimeHookServerClient(serverPath HookServerPath) (*RuntimeHookClient, error) { - cacheKey, err := json.Marshal(serverPath) - if err != nil { - return nil, err - } - if client, ok := cm.cache.Get(string(cacheKey)); ok { + if client, ok := cm.cache.Get(serverPath); ok { return client.(*RuntimeHookClient), nil } @@ -82,6 +77,6 @@ func (cm *HookServerClientManager) RuntimeHookServerClient(serverPath HookServer klog.Errorf("fail to create client %v", err) return nil, err } - cm.cache.Add(string(cacheKey), runtimeHookClient) + cm.cache.Add(serverPath, runtimeHookClient) return runtimeHookClient, nil } diff --git a/pkg/runtimeproxy/resexecutor/cri/container.go b/pkg/runtimeproxy/resexecutor/cri/container.go index 891615f85..4da221273 100644 --- a/pkg/runtimeproxy/resexecutor/cri/container.go +++ b/pkg/runtimeproxy/resexecutor/cri/container.go @@ -74,8 +74,10 @@ func (c *ContainerResourceExecutor) ParseRequest(req interface{}) error { } c.ContainerInfo = store.ContainerInfo{ ContainerResourceHookRequest: &v1alpha1.ContainerResourceHookRequest{ - PodMeta: podCheckPoint.PodMeta, - PodResources: podCheckPoint.Resources, + PodMeta: podCheckPoint.PodMeta, + PodResources: podCheckPoint.Resources, + PodAnnotations: podCheckPoint.Annotations, + PodLabels: podCheckPoint.Labels, ContainerMata: &v1alpha1.ContainerMetadata{ Name: request.GetConfig().GetMetadata().GetName(), Attempt: request.GetConfig().GetMetadata().GetAttempt(), @@ -89,7 +91,12 @@ func (c *ContainerResourceExecutor) ParseRequest(req interface{}) error { case *runtimeapi.StartContainerRequest: return c.loadContainerInfoFromStore(request.GetContainerId(), "StartContainer") case *runtimeapi.UpdateContainerResourcesRequest: - return c.loadContainerInfoFromStore(request.GetContainerId(), "UpdateContainerResource") + err := c.loadContainerInfoFromStore(request.GetContainerId(), "UpdateContainerResource") + if err != nil { + return err + } + c.ContainerResources = updateResourceByUpdateContainerResourceRequest(c.ContainerResources, transferToKoordResources(request.Linux)) + return nil case *runtimeapi.StopContainerRequest: return 
c.loadContainerInfoFromStore(request.GetContainerId(), "StopContainer") } diff --git a/pkg/runtimeproxy/resexecutor/cri/container_test.go b/pkg/runtimeproxy/resexecutor/cri/container_test.go index 30cc14970..42dbf9fec 100644 --- a/pkg/runtimeproxy/resexecutor/cri/container_test.go +++ b/pkg/runtimeproxy/resexecutor/cri/container_test.go @@ -267,3 +267,199 @@ func TestContainerResourceExecutor_ResourceCheckPoint(t *testing.T) { assert.Equal(t, tt.wantStoreInfo, containerInfo) } } + +func TestContainerResourceExecutor_ParseRequest_CreateContainerRequest(t *testing.T) { + type args struct { + podReq interface{} + containerReq interface{} + } + tests := []struct { + name string + args args + wantContainerExecutor store.ContainerInfo + }{ + { + name: "normal case", + args: args{ + podReq: &runtimeapi.RunPodSandboxRequest{ + Config: &runtimeapi.PodSandboxConfig{ + Metadata: &runtimeapi.PodSandboxMetadata{ + Name: "mock pod sandbox", + Namespace: "mock namespace", + Uid: "202207121604", + }, + Annotations: map[string]string{ + "annotation.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Pod": "true", + }, + Labels: map[string]string{ + "label.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Pod": "true", + }, + Linux: &runtimeapi.LinuxPodSandboxConfig{ + CgroupParent: "/kubepods/besteffort", + }, + }, + }, + containerReq: &runtimeapi.CreateContainerRequest{ + PodSandboxId: "202207121604", + Config: &runtimeapi.ContainerConfig{ + Metadata: &runtimeapi.ContainerMetadata{ + Name: "test container", + Attempt: 101010, + }, + Annotations: map[string]string{ + "annotation.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Container": "true", + }, + Labels: map[string]string{ + "label.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Container": "true", + }, + Linux: &runtimeapi.LinuxContainerConfig{ + Resources: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 1000, + CpuShares: 500, + OomScoreAdj: 10, + Unified: map[string]string{ + "resourceA": "resource A", + }, + }, + }, + }, + SandboxConfig: &runtimeapi.PodSandboxConfig{ + Linux: &runtimeapi.LinuxPodSandboxConfig{ + CgroupParent: "/kubepods/besteffort", + }, + }, + }, + }, + wantContainerExecutor: store.ContainerInfo{ + ContainerResourceHookRequest: &v1alpha1.ContainerResourceHookRequest{ + PodMeta: &v1alpha1.PodSandboxMetadata{ + Name: "mock pod sandbox", + Namespace: "mock namespace", + Uid: "202207121604", + }, + PodLabels: map[string]string{ + "label.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Pod": "true", + }, + PodAnnotations: map[string]string{ + "annotation.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Pod": "true", + }, + ContainerMata: &v1alpha1.ContainerMetadata{ + Name: "test container", + Attempt: 101010, + }, + ContainerAnnotations: map[string]string{ + "annotation.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Container": "true", + }, + ContainerResources: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 1000, + CpuShares: 500, + OomScoreAdj: 10, + Unified: map[string]string{ + "resourceA": "resource A", + }, + }, + PodCgroupParent: "/kubepods/besteffort", + }, + }, + }, + } + for _, tt := range tests { + // mock pod cache + p := NewPodResourceExecutor() + _ = p.ParseRequest(tt.args.podReq) + _ = store.WritePodSandboxInfo("202207121604", &p.PodSandboxInfo) + + // 
write container cache + c := NewContainerResourceExecutor() + _ = c.ParseRequest(tt.args.containerReq) + + // check if container cache is set correctly + assert.Equal(t, tt.wantContainerExecutor, c.ContainerInfo) + } +} + +func TestContainerResourceExecutor_ParseRequest_UpdateContainerResourcesRequest(t *testing.T) { + type args struct { + containerID string + containerReq interface{} + ExistingContainerExecutor store.ContainerInfo + } + tests := []struct { + name string + args args + wantContainerInfo store.ContainerInfo + }{ + { + name: "normal case", + args: args{ + containerID: "10101010", + containerReq: &runtimeapi.UpdateContainerResourcesRequest{ + ContainerId: "10101010", + Linux: &runtimeapi.LinuxContainerResources{ + CpusetCpus: "0-31", + }, + }, + ExistingContainerExecutor: store.ContainerInfo{ + ContainerResourceHookRequest: &v1alpha1.ContainerResourceHookRequest{ + PodMeta: &v1alpha1.PodSandboxMetadata{ + Name: "mock pod sandbox", + Namespace: "mock namespace", + Uid: "202207121604", + }, + ContainerMata: &v1alpha1.ContainerMetadata{ + Name: "test container", + Attempt: 101010, + }, + ContainerAnnotations: map[string]string{ + "annotation.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Container": "true", + }, + ContainerResources: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 1000, + CpuShares: 500, + OomScoreAdj: 10, + Unified: map[string]string{ + "resourceA": "resource A", + }, + }, + PodCgroupParent: "/kubepods/besteffort", + }, + }, + }, + wantContainerInfo: store.ContainerInfo{ + ContainerResourceHookRequest: &v1alpha1.ContainerResourceHookRequest{ + PodMeta: &v1alpha1.PodSandboxMetadata{ + Name: "mock pod sandbox", + Namespace: "mock namespace", + Uid: "202207121604", + }, + ContainerMata: &v1alpha1.ContainerMetadata{ + Name: "test container", + Attempt: 101010, + }, + ContainerAnnotations: map[string]string{ + "annotation.dummy.koordinator.sh/TestContainerResourceExecutor_ParseRequest_CreateContainerRequest_Container": "true", + }, + ContainerResources: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 1000, + CpuShares: 500, + OomScoreAdj: 10, + CpusetCpus: "0-31", + Unified: map[string]string{ + "resourceA": "resource A", + }, + }, + PodCgroupParent: "/kubepods/besteffort", + }, + }, + }, + } + for _, tt := range tests { + c := NewContainerResourceExecutor() + // mock container cache + _ = store.WriteContainerInfo(tt.args.containerID, &tt.args.ExistingContainerExecutor) + _ = c.ParseRequest(tt.args.containerReq) + + // check if container cache is set correctly + assert.Equal(t, tt.wantContainerInfo, c.ContainerInfo) + } +} diff --git a/pkg/runtimeproxy/resexecutor/cri/pod.go b/pkg/runtimeproxy/resexecutor/cri/pod.go index 640479e79..bad0fe1ab 100644 --- a/pkg/runtimeproxy/resexecutor/cri/pod.go +++ b/pkg/runtimeproxy/resexecutor/cri/pod.go @@ -67,6 +67,7 @@ func (p *PodResourceExecutor) ParseRequest(req interface{}) error { PodMeta: &v1alpha1.PodSandboxMetadata{ Name: request.GetConfig().GetMetadata().GetName(), Namespace: request.GetConfig().GetMetadata().GetNamespace(), + Uid: request.GetConfig().GetMetadata().GetUid(), }, RuntimeHandler: request.GetRuntimeHandler(), Annotations: request.GetConfig().GetAnnotations(), @@ -90,6 +91,7 @@ func (p *PodResourceExecutor) ParsePod(podsandbox *runtimeapi.PodSandbox) error PodMeta: &v1alpha1.PodSandboxMetadata{ Name: podsandbox.GetMetadata().GetName(), Namespace: podsandbox.GetMetadata().GetNamespace(), + Uid: podsandbox.GetMetadata().GetUid(), }, RuntimeHandler: 
podsandbox.GetRuntimeHandler(), Annotations: podsandbox.GetAnnotations(), diff --git a/pkg/runtimeproxy/resexecutor/cri/utils.go b/pkg/runtimeproxy/resexecutor/cri/utils.go index 610d1f9b5..aedfc01a1 100644 --- a/pkg/runtimeproxy/resexecutor/cri/utils.go +++ b/pkg/runtimeproxy/resexecutor/cri/utils.go @@ -81,7 +81,7 @@ func updateResource(a, b *v1alpha1.LinuxContainerResources) *v1alpha1.LinuxConta a.CpuShares = b.CpuShares } if b.MemoryLimitInBytes > 0 { - a.MemoryLimitInBytes = b.MemorySwapLimitInBytes + a.MemoryLimitInBytes = b.MemoryLimitInBytes } if b.OomScoreAdj >= -1000 && b.OomScoreAdj <= 1000 { a.OomScoreAdj = b.OomScoreAdj @@ -98,3 +98,37 @@ func updateResource(a, b *v1alpha1.LinuxContainerResources) *v1alpha1.LinuxConta } return a } + +// updateResourceByUpdateContainerResourceRequest updates resources in cache by UpdateContainerResource request. +// updateResourceByUpdateContainerResourceRequest will omit OomScoreAdj. +// +// Normally kubelet won't send UpdateContainerResource request, so if some components want to send it and want to update OomScoreAdj, +// please use hook to achieve it. +func updateResourceByUpdateContainerResourceRequest(a, b *v1alpha1.LinuxContainerResources) *v1alpha1.LinuxContainerResources { + if a == nil || b == nil { + return a + } + if b.CpuPeriod > 0 { + a.CpuPeriod = b.CpuPeriod + } + if b.CpuQuota > 0 { + a.CpuQuota = b.CpuQuota + } + if b.CpuShares > 0 { + a.CpuShares = b.CpuShares + } + if b.MemoryLimitInBytes > 0 { + a.MemoryLimitInBytes = b.MemoryLimitInBytes + } + if b.CpusetCpus != "" { + a.CpusetCpus = b.CpusetCpus + } + if b.CpusetMems != "" { + a.CpusetMems = b.CpusetMems + } + a.Unified = utils.MergeMap(a.Unified, b.Unified) + if b.MemorySwapLimitInBytes > 0 { + a.MemorySwapLimitInBytes = b.MemorySwapLimitInBytes + } + return a +} diff --git a/pkg/runtimeproxy/resexecutor/cri/utils_test.go b/pkg/runtimeproxy/resexecutor/cri/utils_test.go index 8592298de..26ff4f98e 100644 --- a/pkg/runtimeproxy/resexecutor/cri/utils_test.go +++ b/pkg/runtimeproxy/resexecutor/cri/utils_test.go @@ -48,26 +48,41 @@ func Test_updateResource(t *testing.T) { name: "normal case", args: args{ a: &v1alpha1.LinuxContainerResources{ - CpuPeriod: 1000, - CpuShares: 500, - OomScoreAdj: 10, + CpuPeriod: 1000, + CpuQuota: 2000, + CpuShares: 500, + OomScoreAdj: 10, + MemorySwapLimitInBytes: 100, + MemoryLimitInBytes: 300, + CpusetCpus: "0-64", + CpusetMems: "0-2", Unified: map[string]string{ "resourceA": "resource A", }, }, b: &v1alpha1.LinuxContainerResources{ - CpuPeriod: 2000, - CpuShares: 1000, - OomScoreAdj: 20, + CpuPeriod: 2000, + CpuQuota: 4000, + CpuShares: 1000, + OomScoreAdj: 20, + MemorySwapLimitInBytes: 200, + MemoryLimitInBytes: 600, + CpusetCpus: "0-31", + CpusetMems: "0-4", Unified: map[string]string{ "resourceB": "resource B", }, }, }, want: &v1alpha1.LinuxContainerResources{ - CpuPeriod: 2000, - CpuShares: 1000, - OomScoreAdj: 20, + CpuPeriod: 2000, + CpuQuota: 4000, + CpuShares: 1000, + OomScoreAdj: 20, + MemorySwapLimitInBytes: 200, + MemoryLimitInBytes: 600, + CpusetCpus: "0-31", + CpusetMems: "0-4", Unified: map[string]string{ "resourceA": "resource A", "resourceB": "resource B", @@ -154,3 +169,74 @@ func Test_transferToCRIResources(t *testing.T) { assert.Equal(t, tt.want, gotResources) } } + +func Test_updateResourceByUpdateContainerResourceRequest(t *testing.T) { + type args struct { + a *v1alpha1.LinuxContainerResources + b *v1alpha1.LinuxContainerResources + } + tests := []struct { + name string + args args + want 
*v1alpha1.LinuxContainerResources + }{ + { + name: "a and b are both nil", + args: args{ + a: nil, + b: nil, + }, + want: nil, + }, + { + name: "normal case", + args: args{ + a: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 1000, + CpuQuota: 2000, + CpuShares: 500, + OomScoreAdj: 10, + MemorySwapLimitInBytes: 100, + MemoryLimitInBytes: 300, + CpusetCpus: "0-64", + CpusetMems: "0-2", + Unified: map[string]string{ + "resourceA": "resource A", + }, + }, + b: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 2000, + CpuQuota: 4000, + CpuShares: 1000, + OomScoreAdj: 20, + MemorySwapLimitInBytes: 200, + MemoryLimitInBytes: 600, + CpusetCpus: "0-31", + CpusetMems: "0-4", + Unified: map[string]string{ + "resourceB": "resource B", + }, + }, + }, + want: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 2000, + CpuQuota: 4000, + CpuShares: 1000, + OomScoreAdj: 10, + MemorySwapLimitInBytes: 200, + MemoryLimitInBytes: 600, + CpusetCpus: "0-31", + CpusetMems: "0-4", + Unified: map[string]string{ + "resourceA": "resource A", + "resourceB": "resource B", + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.want, updateResourceByUpdateContainerResourceRequest(tt.args.a, tt.args.b), "updateResourceByUpdateContainerResourceRequest(%v, %v)", tt.args.a, tt.args.b) + }) + } +} diff --git a/pkg/runtimeproxy/server/cri/criserver.go b/pkg/runtimeproxy/server/cri/criserver.go index 5398b39ac..48e6e3710 100644 --- a/pkg/runtimeproxy/server/cri/criserver.go +++ b/pkg/runtimeproxy/server/cri/criserver.go @@ -105,8 +105,12 @@ func (c *RuntimeManagerCriServer) interceptRuntimeRequest(serviceType RuntimeSer // pre call hook server // TODO deal with the Dispatch response - if response, err := c.hookDispatcher.Dispatch(ctx, runtimeHookPath, config.PreHook, resourceExecutor.GenerateHookRequest()); err != nil { + response, err := c.hookDispatcher.Dispatch(ctx, runtimeHookPath, config.PreHook, resourceExecutor.GenerateHookRequest()) + if err != nil { klog.Errorf("fail to call hook server %v", err) + } else if response == nil { + // when hook is not registered, the response will become nil + klog.Warningf("runtime hook path %s does not register any PreHooks", string(runtimeHookPath)) } else { if err = resourceExecutor.UpdateRequest(response, request); err != nil { klog.Errorf("failed to update cri request %v", err) diff --git a/pkg/runtimeproxy/server/docker/handler.go b/pkg/runtimeproxy/server/docker/handler.go index 2e579c9cb..2abbf8bd0 100644 --- a/pkg/runtimeproxy/server/docker/handler.go +++ b/pkg/runtimeproxy/server/docker/handler.go @@ -263,6 +263,13 @@ func (d *RuntimeManagerDockerServer) HandleUpdateContainer(ctx context.Context, hookReq = containerMeta.GetContainerResourceHookRequest() } + // update resources in cache with UpdateConfig + if containerConfig != nil && hookReq != nil { + if updateReq, ok := hookReq.(*v1alpha1.ContainerResourceHookRequest); ok { + updateReq.ContainerResources = MergeResourceByUpdateConfig(updateReq.ContainerResources, containerConfig) + } + } + response, err := d.dispatcher.Dispatch(ctx, runtimeHookPath, config.PreHook, hookReq) if err != nil { klog.Errorf("Failed to call pre update hook server %v", err) diff --git a/pkg/runtimeproxy/server/docker/server.go b/pkg/runtimeproxy/server/docker/server.go index cc554949b..ae029ec06 100644 --- a/pkg/runtimeproxy/server/docker/server.go +++ b/pkg/runtimeproxy/server/docker/server.go @@ -131,6 +131,7 @@ func (d *RuntimeManagerDockerServer) failOver(dockerClient proxyDockerClient) er 
CgroupParent: s.ContainerJSON.HostConfig.CgroupParent, PodMeta: &v1alpha1.PodSandboxMetadata{ Name: s.Name, + Uid: s.ID, }, RuntimeHandler: "Docker", Resources: HostConfigToResource(s.ContainerJSON.HostConfig), diff --git a/pkg/runtimeproxy/server/docker/utils.go b/pkg/runtimeproxy/server/docker/utils.go index 9d98a8138..b0ae06c23 100644 --- a/pkg/runtimeproxy/server/docker/utils.go +++ b/pkg/runtimeproxy/server/docker/utils.go @@ -188,6 +188,34 @@ func UpdateUpdateConfigByResource(containerConfig *container.UpdateConfig, resou return containerConfig } +func MergeResourceByUpdateConfig(resources *v1alpha1.LinuxContainerResources, containerConfig *container.UpdateConfig) *v1alpha1.LinuxContainerResources { + if containerConfig == nil || resources == nil { + return resources + } + if containerConfig.CPUPeriod > 0 { + resources.CpuPeriod = containerConfig.CPUPeriod + } + if containerConfig.CPUQuota > 0 { + resources.CpuQuota = containerConfig.CPUQuota + } + if containerConfig.CPUShares > 0 { + resources.CpuShares = containerConfig.CPUShares + } + if containerConfig.Memory > 0 { + resources.MemoryLimitInBytes = containerConfig.Memory + } + if containerConfig.CpusetCpus != "" { + resources.CpusetCpus = containerConfig.CpusetCpus + } + if containerConfig.CpusetMems != "" { + resources.CpusetMems = containerConfig.CpusetMems + } + if containerConfig.MemorySwap > 0 { + resources.MemorySwapLimitInBytes = containerConfig.MemorySwap + } + return resources +} + // generateExpectedCgroupParent is adapted from Dockershim func generateExpectedCgroupParent(cgroupDriver, cgroupParent string) string { if cgroupParent != "" { diff --git a/pkg/runtimeproxy/server/docker/utils_test.go b/pkg/runtimeproxy/server/docker/utils_test.go index 089f4e900..21f44676b 100644 --- a/pkg/runtimeproxy/server/docker/utils_test.go +++ b/pkg/runtimeproxy/server/docker/utils_test.go @@ -316,3 +316,89 @@ func Test_generateExpectedCgroupParent(t *testing.T) { assert.Equal(t, tt.expectedCgroupParent, currentCgroupParent) } } + +func TestMergeResourceByUpdateConfig(t *testing.T) { + type args struct { + resources *v1alpha1.LinuxContainerResources + containerConfig *container.UpdateConfig + } + tests := []struct { + name string + args args + want *v1alpha1.LinuxContainerResources + }{ + { + name: "nil config", + args: args{ + resources: nil, + containerConfig: nil, + }, + want: nil, + }, + { + name: "normal case", + args: args{ + containerConfig: &container.UpdateConfig{ + Resources: container.Resources{ + CPUShares: 100, + CPUPeriod: 200, + CPUQuota: 300, + CpusetCpus: "0-64", + CpusetMems: "0-2", + Memory: 400, + MemorySwap: 500, + }, + }, + resources: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 20, + CpuQuota: 30, + CpuShares: 10, + MemoryLimitInBytes: 40, + OomScoreAdj: -998, + CpusetCpus: "", + CpusetMems: "", + MemorySwapLimitInBytes: 50, + }, + }, + want: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 200, + CpuQuota: 300, + CpuShares: 100, + MemoryLimitInBytes: 400, + OomScoreAdj: -998, + CpusetCpus: "0-64", + CpusetMems: "0-2", + MemorySwapLimitInBytes: 500, + }, + }, + { + name: "UpdateConfig only has non-trivial cpuset field", + args: args{ + resources: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 10000, + CpuQuota: 10000, + CpuShares: 1000, + OomScoreAdj: -998, + CpusetCpus: "0-63", + MemorySwapLimitInBytes: 1000, + }, + containerConfig: &container.UpdateConfig{ + Resources: container.Resources{ + CpusetCpus: "0-31", + }, + }, + }, + want: &v1alpha1.LinuxContainerResources{ + CpuPeriod: 10000, + CpuQuota: 10000, + 
CpuShares: 1000, + OomScoreAdj: -998, + CpusetCpus: "0-31", + MemorySwapLimitInBytes: 1000, + }, + }, + } + for _, tt := range tests { + assert.Equal(t, tt.want, MergeResourceByUpdateConfig(tt.args.resources, tt.args.containerConfig)) + } +} diff --git a/pkg/scheduler/frameworkext/framework_extender.go b/pkg/scheduler/frameworkext/framework_extender.go index 36946c39c..caa4d129b 100644 --- a/pkg/scheduler/frameworkext/framework_extender.go +++ b/pkg/scheduler/frameworkext/framework_extender.go @@ -17,10 +17,13 @@ limitations under the License. package frameworkext import ( + "context" "sync" nrtinformers "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/informers/externalversions" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/scheduler/framework" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" @@ -96,6 +99,96 @@ func (ext *frameworkExtendedHandleImpl) NodeResourceTopologySharedInformerFactor return ext.nrtSharedInformerFactory } +type FrameworkExtender interface { + framework.Framework +} + +type FrameworkExtenderFactory interface { + New(f framework.Framework) FrameworkExtender +} + +type SchedulingPhaseHook interface { + Name() string +} + +type PreFilterPhaseHook interface { + SchedulingPhaseHook + PreFilterHook(handle ExtendedHandle, state *framework.CycleState, pod *corev1.Pod) (*corev1.Pod, bool) +} + +type FilterPhaseHook interface { + SchedulingPhaseHook + FilterHook(handle ExtendedHandle, cycleState *framework.CycleState, pod *corev1.Pod, nodeInfo *framework.NodeInfo) (*corev1.Pod, *framework.NodeInfo, bool) +} + +type frameworkExtenderFactoryImpl struct { + handle ExtendedHandle + + // extend framework with SchedulingPhaseHook + preFilterHooks []PreFilterPhaseHook + filterHooks []FilterPhaseHook +} + +func NewFrameworkExtenderFactory(handle ExtendedHandle, hooks ...SchedulingPhaseHook) FrameworkExtenderFactory { + i := &frameworkExtenderFactoryImpl{ + handle: handle, + } + for _, h := range hooks { + // a hook may register in multiple phases + preFilter, ok := h.(PreFilterPhaseHook) + if ok { + i.preFilterHooks = append(i.preFilterHooks, preFilter) + } + filter, ok := h.(FilterPhaseHook) + if ok { + i.filterHooks = append(i.filterHooks, filter) + } + } + return i +} + +func (i *frameworkExtenderFactoryImpl) New(f framework.Framework) FrameworkExtender { + return &frameworkExtenderImpl{ + Framework: f, + handle: i.handle, + preFilterHooks: i.preFilterHooks, + filterHooks: i.filterHooks, + } +} + +var _ framework.Framework = &frameworkExtenderImpl{} + +type frameworkExtenderImpl struct { + framework.Framework + handle ExtendedHandle + + preFilterHooks []PreFilterPhaseHook + filterHooks []FilterPhaseHook +} + +func (ext *frameworkExtenderImpl) RunPreFilterPlugins(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod) *framework.Status { + for _, hook := range ext.preFilterHooks { + newPod, hooked := hook.PreFilterHook(ext.handle, cycleState, pod) + if hooked { + klog.V(5).InfoS("RunPreFilterPlugins hooked", "meet PreFilterPhaseHook", "hook", hook.Name(), "pod", klog.KObj(pod)) + return ext.Framework.RunPreFilterPlugins(ctx, cycleState, newPod) + } + } + return ext.Framework.RunPreFilterPlugins(ctx, cycleState, pod) +} + +func (ext *frameworkExtenderImpl) RunFilterPlugins(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeInfo *framework.NodeInfo) framework.PluginToStatus { + for _, hook := range ext.filterHooks { + // hook can change the 
args (cycleState, pod, nodeInfo) for filter plugins + newPod, newNodeInfo, hooked := hook.FilterHook(ext.handle, cycleState, pod, nodeInfo) + if hooked { + klog.V(5).InfoS("RunFilterPlugins hooked", "meet FilterPhaseHook", "hook", hook.Name(), "pod", klog.KObj(pod)) + return ext.Framework.RunFilterPlugins(ctx, cycleState, newPod, newNodeInfo) + } + } + return ext.Framework.RunFilterPlugins(ctx, cycleState, pod, nodeInfo) +} + // PluginFactoryProxy is used to proxy the call to the PluginFactory function and pass in the ExtendedHandle for the custom plugin func PluginFactoryProxy(extendHandle ExtendedHandle, factoryFn frameworkruntime.PluginFactory) frameworkruntime.PluginFactory { return func(args runtime.Object, handle framework.Handle) (framework.Plugin, error) { diff --git a/pkg/scheduler/plugins/gang/gang_cache.go b/pkg/scheduler/plugins/gang/gang_cache.go new file mode 100644 index 000000000..f84f24f4d --- /dev/null +++ b/pkg/scheduler/plugins/gang/gang_cache.go @@ -0,0 +1,383 @@ +package gang + +import ( + "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/util" + "k8s.io/api/core/v1" + "k8s.io/klog/v2" + "strconv" + "sync" + "time" +) + +type gangCache struct { + lock *sync.RWMutex + gangItems map[string]*Gang +} + +func NewGangCache() *gangCache { + return &gangCache{ + gangItems: make(map[string]*Gang), + lock: new(sync.RWMutex), + } +} + +func (gangCache *gangCache) onPodAdd(obj interface{}) { + pod, ok := obj.(*v1.Pod) + if !ok { + return + } + gangCache.AddPod(pod) +} + +func (gangCache *gangCache) onPodDelete(obj interface{}) { + pod, ok := obj.(*v1.Pod) + if !ok { + return + } + gangCache.DeletePod(pod) + gangName := pod.Annotations[extension.GangNameAnnotation] + //whether need to delete the gang from the gangCache + if num, found := gangCache.GetChildrenNum(gangName); found && num == 0 { + gangCache.DeleteGang(gangName) + } +} + +func (gangCache *gangCache) DeleteGang(gangName string) { + if gangName == "" { + return + } + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + delete(gangCache.gangItems, gangName) +} + +func (gangCache *gangCache) HasGang(gangName string) bool { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if _, ok := gangCache.gangItems[gangName]; !ok { + return false + } + return true +} + +//Get functions +func (gangCache *gangCache) GetGangWaitTime(gangName string) (time.Duration, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + return gang.WaitTime, true + } +} + +func (gangCache *gangCache) GetChildrenNum(gangName string) (int, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + return len(gang.Children), true + } +} + +func (gangCache *gangCache) GetGangMinNum(gangName string) (int, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + return gang.MinRequiredNumber, true + } +} + +func (gangCache *gangCache) GetGangTotalNum(gangName string) (int, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + return gang.TotalChildrenNum, true + } +} + +func (gangCache *gangCache) GetGangMode(gangName string) (string, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := 
gangCache.gangItems[gangName]; !ok { + return "", false + } else { + return gang.Mode, true + } +} + +func (gangCache *gangCache) GetGangAssumedPods(gangName string) (int, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + return len(gang.WaitingForBindChildren) + len(gang.BoundChildren), true + } +} + +func (gangCache *gangCache) GetGangScheduleCycle(gangName string) (int, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + return gang.ScheduleCycle, true + } +} + +func (gangCache *gangCache) GetChildScheduleCycle(gangName string, childName string) (int, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return 0, false + } else { + if cycle, found := gang.ChildrenScheduleRoundMap[childName]; !found { + return 0, false + } else { + return cycle, true + } + } +} + +func (gangCache *gangCache) GetCreateTime(gangName string) (time.Time, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return time.Time{}, false + } else { + return gang.CreateTime, true + } +} + +func (gangCache *gangCache) GetGangGroup(gangName string) ([]string, bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return nil, false + } else { + return gang.GangGroup, true + } +} + +func (gangCache *gangCache) IsGangResourceSatisfied(gangName string) (isSatisfied bool, found bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return false, false + } else { + return gang.ResourceSatisfied, true + } +} + +func (gangCache *gangCache) IsGangScheduleCycleValid(gangName string) (valid bool, found bool) { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + if gang, ok := gangCache.gangItems[gangName]; !ok { + return false, false + } else { + return gang.ScheduleCycleValid, true + } +} + +//Set functions +func (gangCache *gangCache) SetScheduleCycle(gangName string, scheduleCycle int) { + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + gang := gangCache.gangItems[gangName] + gang.ScheduleCycle = scheduleCycle +} + +func (gangCache *gangCache) SetScheduleCycleValid(gangName string, valid bool) { + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + gang := gangCache.gangItems[gangName] + gang.ScheduleCycleValid = valid +} + +func (gangCache *gangCache) SetChildCycle(gangName, childName string, childCycle int) { + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + gang := gangCache.gangItems[gangName] + gang.ChildrenScheduleRoundMap[childName] = childCycle +} + +// CountChildNumWithCycle return how many children with the childCycle in the ChildrenScheduleRoundMap +func (gangCache *gangCache) CountChildNumWithCycle(gangName string, childCycle int) int { + gangCache.lock.RLock() + defer gangCache.lock.RUnlock() + gang := gangCache.gangItems[gangName] + num := 0 + for _, cycle := range gang.ChildrenScheduleRoundMap { + if cycle == childCycle { + num++ + } + } + return num +} + +func (gangCache *gangCache) AddPod(pod *v1.Pod) { + if util.IsPodTerminated(pod) { + gangCache.DeletePod(pod) + return + } + gangName := pod.Annotations[extension.GangNameAnnotation] + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + var gang *Gang + if _, ok 
:= gangCache.gangItems[gangName]; ok { + gang = gangCache.gangItems[gangName] + } else { + gang = gangCache.NewGangWithPod(pod) + } + podName := pod.Name + gang.Children[podName] = pod + gang.ChildrenScheduleRoundMap[podName] = 0 +} + +// NewGangWithPod +//create Gang depending on its first pod's Annotations +func (gangCache *gangCache) NewGangWithPod(pod *v1.Pod) *Gang { + gangName := pod.Annotations[extension.GangNameAnnotation] + minRequiredNumber := pod.Annotations[extension.GangMinNumAnnotation] + totalChildrenNum := pod.Annotations[extension.GangTotalNumAnnotation] + mode := pod.Annotations[extension.GangModeAnnotation] + waitTime := pod.Annotations[extension.GangWaitTimeAnnotation] + gangGroup := pod.Annotations[extension.GangGroupsAnnotation] + rawGang := NewGang(gangName) + if minRequiredNumber != "" { + num, err := strconv.Atoi(minRequiredNumber) + if err != nil { + klog.Errorf("pod's annotation MinRequiredNumber illegal,err:%v", err.Error()) + } else { + rawGang.MinRequiredNumber = num + } + } + if totalChildrenNum != "" { + num, err := strconv.Atoi(totalChildrenNum) + if err != nil { + klog.Errorf("pod's annotation totalChildrenNum illegal,err:%v", err.Error()) + } else { + rawGang.TotalChildrenNum = num + } + } else { + rawGang.TotalChildrenNum = rawGang.MinRequiredNumber + } + if mode != "" { + if mode != extension.StrictMode || mode != extension.NonStrictMode { + klog.Errorf("pod's annotation mode illegal,err:%v") + } else { + rawGang.Mode = mode + } + } + if waitTime != "" { + num, err := strconv.Atoi(waitTime) + if err != nil { + klog.Errorf("pod's annotation waitTime illegal,err:%v", err.Error()) + } else { + rawGang.WaitTime = time.Duration(num) * time.Second + } + } + if gangGroup != "" { + groupSlice, err := util.StringToGangGroupSlice(gangGroup) + if err != nil { + klog.Errorf("pod's annotation gangGroup illegal") + } else { + rawGang.GangGroup = groupSlice + } + } + return rawGang +} + +func (gangCache *gangCache) AddAssumedPod(pod *v1.Pod) { + if pod == nil { + return + } + gangName := pod.Annotations[extension.GangNameAnnotation] + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + gang := gangCache.gangItems[gangName] + podName := pod.Name + delete(gang.BoundChildren, podName) + gang.WaitingForBindChildren[podName] = pod + if len(gang.WaitingForBindChildren)+len(gang.BoundChildren) >= gang.MinRequiredNumber { + gang.ResourceSatisfied = true + } +} + +func (gangCache *gangCache) AddBoundPod(pod *v1.Pod) { + if pod == nil { + return + } + gangName := pod.Annotations[extension.GangNameAnnotation] + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + gang := gangCache.gangItems[gangName] + podName := pod.Name + delete(gang.WaitingForBindChildren, podName) + gang.BoundChildren[podName] = pod +} + +func (gangCache *gangCache) DeletePod(pod *v1.Pod) { + if pod == nil { + return + } + gangName := pod.Annotations[extension.GangNameAnnotation] + gangCache.lock.Lock() + defer gangCache.lock.Unlock() + gang := gangCache.gangItems[gangName] + podName := pod.Name + delete(gang.Children, podName) + delete(gang.WaitingForBindChildren, podName) + delete(gang.BoundChildren, podName) + delete(gang.ChildrenScheduleRoundMap, podName) + if len(gang.WaitingForBindChildren)+len(gang.BoundChildren) < gang.MinRequiredNumber { + gang.ResourceSatisfied = false + } +} + +// Gang basic gang info recorded in gangCache +type Gang struct { + Name string + WaitTime time.Duration + CreateTime time.Time + //strict-mode or non-strict-mode + Mode string + MinRequiredNumber int + 
TotalChildrenNum int + GangGroup []string + Children map[string]*v1.Pod + //pods that have already assumed(waiting in Permit stage) + WaitingForBindChildren map[string]*v1.Pod + //pods that have already bound + BoundChildren map[string]*v1.Pod + //if assumed pods number has reached to MinRequiredNumber + ResourceSatisfied bool + + //if the gang should be passed at PreFilter stage(Strict-Mode) + ScheduleCycleValid bool + //these fields used to count the cycle + ScheduleCycle int + ChildrenScheduleRoundMap map[string]int +} + +func NewGang(gangName string) *Gang { + return &Gang{ + Name: gangName, + CreateTime: time.Now(), + WaitTime: extension.DefaultGangWaitTime, + Mode: extension.StrictMode, + Children: make(map[string]*v1.Pod), + WaitingForBindChildren: make(map[string]*v1.Pod), + BoundChildren: make(map[string]*v1.Pod), + ScheduleCycleValid: true, + ScheduleCycle: 1, + ChildrenScheduleRoundMap: make(map[string]int), + } +} diff --git a/pkg/scheduler/plugins/gang/gang_plugin.go b/pkg/scheduler/plugins/gang/gang_plugin.go new file mode 100644 index 000000000..c58c7bfe8 --- /dev/null +++ b/pkg/scheduler/plugins/gang/gang_plugin.go @@ -0,0 +1,411 @@ +package gang + +import ( + "context" + "encoding/json" + "fmt" + "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/util" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + helpers "k8s.io/component-helpers/scheduling/corev1" + "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/scheduler/framework" + "time" +) + +const ( + Name = "Gang" +) + +var ( + _ framework.PreFilterPlugin = &GangPlugin{} + _ framework.PostFilterPlugin = &GangPlugin{} + _ framework.ReservePlugin = &GangPlugin{} + _ framework.PostBindPlugin = &GangPlugin{} + _ framework.PostBindPlugin = &GangPlugin{} + _ framework.QueueSortPlugin = &GangPlugin{} +) + +type GangPlugin struct { + frameworkHandler framework.Handle + podLister v1.PodLister + gangCache *gangCache +} + +func New(args runtime.Object, handle framework.Handle) (framework.Plugin, error) { + gangCache := NewGangCache() + //recover the gangCache + if err := RecoverGangCache(handle, gangCache); err != nil { + return nil, err + } + podInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer() + podLister := handle.SharedInformerFactory().Core().V1().Pods().Lister() + podInformer.AddEventHandler(cache.FilteringResourceEventHandler{ + //FilterFunc: func(obj interface{}) bool { + // switch t := obj.(type) { + // case *corev1.Pod: + // return CheckPodGangInfo(t) + // default: + // utilruntime.HandleError(fmt.Errorf("unable to handle object %T", obj)) + // return false + // } + //}, + Handler: cache.ResourceEventHandlerFuncs{ + + AddFunc: gangCache.onPodAdd, + DeleteFunc: gangCache.onPodDelete, + }, + }) + return &GangPlugin{ + frameworkHandler: handle, + podLister: podLister, + gangCache: gangCache, + }, nil +} + +func (p *GangPlugin) Name() string { return Name } + +//Less is used to sort pods in the scheduling queue in the following order. +//Firstly, compare the priorities of the two pods, the higher priority (if pod's priority is equal,then compare their KoordinatorPriority at labels )is at the front of the queue, +//Secondly, compare creationTimestamp of two pods, if pod belongs to a Gang, then we compare creationTimestamp of the Gang, the one created first will be at the front of the queue. 
+//Finally, compare the pods' namespaces; if a pod belongs to a Gang, then we compare the Gang name.
+func (p *GangPlugin) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool {
+	prio1 := helpers.PodPriority(podInfo1.Pod)
+	prio2 := helpers.PodPriority(podInfo2.Pod)
+	if prio1 != prio2 {
+		return prio1 > prio2
+	}
+	subPrio1, err := util.GetSubPriority(podInfo1.Pod)
+	if err != nil {
+		klog.Errorf("GetSubPriority of the pod %s err:%v", podInfo1.Pod.Name, err)
+	}
+	subPrio2, err := util.GetSubPriority(podInfo2.Pod)
+	if err != nil {
+		klog.Errorf("GetSubPriority of the pod %s err:%v", podInfo2.Pod.Name, err)
+	}
+	if subPrio1 != subPrio2 {
+		return subPrio1 > subPrio2
+	}
+
+	creationTime1 := p.GetCreatTime(podInfo1)
+	creationTime2 := p.GetCreatTime(podInfo2)
+	if creationTime1.Equal(creationTime2) {
+		return util.GetNamespacedName(podInfo1.Pod) < util.GetNamespacedName(podInfo2.Pod)
+	}
+	return creationTime1.Before(creationTime2)
+}
+
+// PreFilter
+//if non-strict-mode, we only do step i and step ii:
+// i.Check whether the children of the Gang have met the Gang's minimum number requirement, and reject the pod if not.
+// ii.Check whether the Gang has timed out (check the pod's annotation, introduced later in the Permit section), and reject the pod if so.
+// iii.Check whether the Gang has passed the scheduleCycleValid check, and reject the pod if not.
+// iv.Try to update scheduleCycle, scheduleCycleValid and childrenScheduleRoundMap as mentioned above.
+func (p *GangPlugin) PreFilter(ctx context.Context, state *framework.CycleState, pod *corev1.Pod) *framework.Status {
+	gangCache := p.gangCache
+	gangName := pod.Annotations[extension.GangNameAnnotation]
+	mode, found := gangCache.GetGangMode(gangName)
+	if !found {
+		klog.Infof("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+		return framework.NewStatus(framework.Unschedulable, "can not find gang in the gang cache")
+	}
+	if err := p.PreFilterCheck(pod, gangName, mode); err != nil {
+		klog.Errorf("PreFilter failed err:%s", err.Error())
+		return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
+	}
+	return framework.NewStatus(framework.Success, "")
+}
+
+// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one.
+func (p *GangPlugin) PreFilterExtensions() framework.PreFilterExtensions {
+	return nil
+}
+
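// Illustrative sketch, not part of this patch: a pod annotated so that the gang cache and the
// Less/PreFilter logic above can pick it up, written as it might appear in a test. The annotation
// keys are the extension constants already used by NewGangWithPod in gang_cache.go; the gang name
// and the numeric values ("demo-gang", "3", "5", "60") are hypothetical, and the snippet assumes
// the same imports as gang_plugin.go (corev1, metav1, extension).
demoPod := &corev1.Pod{
	ObjectMeta: metav1.ObjectMeta{
		Name:      "demo-pod-0",
		Namespace: "default",
		Annotations: map[string]string{
			extension.GangNameAnnotation:     "demo-gang",
			extension.GangMinNumAnnotation:   "3",  // parsed by NewGangWithPod into MinRequiredNumber
			extension.GangTotalNumAnnotation: "5",  // parsed into TotalChildrenNum
			extension.GangModeAnnotation:     extension.StrictMode,
			extension.GangWaitTimeAnnotation: "60", // interpreted as seconds
		},
	},
}
_ = demoPod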
+func (p *GangPlugin) PostFilter(ctx context.Context, state *framework.CycleState, pod *corev1.Pod,
+	filteredNodeStatusMap framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
+	gangCache := p.gangCache
+	gangName := pod.Annotations[extension.GangNameAnnotation]
+	mode, found := gangCache.GetGangMode(gangName)
+	if !found {
+		klog.Infof("post-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+		return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable, "can not find gang in the gang cache")
+	}
+	if mode == extension.StrictMode {
+		p.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) {
+			if waitingPod.GetPod().Annotations[extension.GangNameAnnotation] == gangName {
+				klog.Errorf("postFilter rejects the waiting pod %v from Gang %s", waitingPod.GetPod().Name, gangName)
+				waitingPod.Reject(p.Name(), "optimistic rejection in PostFilter")
+			}
+		})
+		gangCache.SetScheduleCycleValid(gangName, false)
+		return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable,
+			fmt.Sprintf("Gang %v gets rejected this cycle because Pod %v is unschedulable even after PostFilter in StrictMode", gangName, pod.Name))
+	}
+	return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable,
+		fmt.Sprintf("Pod %v from Gang %v is unschedulable in NonStrictMode", pod.Name, gangName))
+}
+
+// Permit
+//For every Gang in the GangGroup, calculate whether the current number of assumed pods meets that Gang's minimum requirement,
+//and decide whether the pod should keep waiting at the Permit stage or the whole gangGroup can proceed to binding.
+func (p *GangPlugin) Permit(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) (*framework.Status, time.Duration) {
+	waitTime, s := p.PermitCheck(pod)
+	var retStatus *framework.Status
+	switch s {
+	case extension.GangNotFoundInCache:
+		return framework.NewStatus(framework.Unschedulable, "Gang not found in gangCache"), 0
+	case extension.Wait:
+		klog.Infof("Pod %v from gang %v is waiting to be scheduled at Permit stage", pod.Name, pod.Annotations[extension.GangNameAnnotation])
+		retStatus = framework.NewStatus(framework.Wait)
+		p.ActivateGang(pod, state)
+	case extension.Success:
+		p.AllowGangGroup(pod)
+		retStatus = framework.NewStatus(framework.Success)
+		waitTime = 0
+	}
+	return retStatus, waitTime
+}
+
+// Reserve is the function invoked by the framework at the "reserve" extension point.
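+// Note: Reserve is a no-op for this plugin; assumed gang members are tracked in Permit (see PermitCheck) and released in Unreserve when the gang times out.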
+func (p *GangPlugin) Reserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status {
+	return nil
+}
+
+// Unreserve
+//(1) handles the timed-out gang
+//(2) does nothing when binding failed
+func (p *GangPlugin) Unreserve(ctx context.Context, state *framework.CycleState, pod *corev1.Pod, nodeName string) {
+	gangName := pod.Annotations[extension.GangNameAnnotation]
+	gangCache := p.gangCache
+	resourceSatisfied, _ := gangCache.IsGangResourceSatisfied(gangName)
+
+	//gang timed out
+	if !resourceSatisfied {
+		klog.Infof("gang %v timed out, start to release the assumed resources and add annotations to the gang's children", gangName)
+		timeoutAnnotations := map[string]interface{}{
+			"metadata": map[string]map[string]string{
+				"annotations": {
+					extension.GangTimeOutAnnotation: "true",
+				}},
+		}
+		pods, err := p.podLister.List(nil)
+		if err != nil {
+			klog.Errorf("unReserve list pod err: %v", err.Error())
+			return
+		}
+		//add the timeout annotation to all the children of the gang
+		for _, pod := range pods {
+			if pod.Annotations[extension.GangNameAnnotation] == gangName {
+				ns := pod.Namespace
+				podName := pod.Name
+				updateAnnotation, _ := json.Marshal(timeoutAnnotations)
+				_, err := p.frameworkHandler.ClientSet().CoreV1().Pods(ns).Patch(ctx, podName, types.StrategicMergePatchType, updateAnnotation, metav1.PatchOptions{})
+				if err != nil {
+					klog.Errorf("unReserve failed to patch the timeout annotation to pod %v/%v, err: %v", ns, podName, err.Error())
+				}
+			}
+		}
+		//release the resources of all assumed children of the gang
+		p.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) {
+			if waitingPod.GetPod().Annotations[extension.GangNameAnnotation] == gangName {
+				klog.Errorf("unReserve rejects the waiting pod %v from Gang %s due to timeout", waitingPod.GetPod().Name, gangName)
+				waitingPod.Reject(p.Name(), "optimistic rejection in unReserve due to timeout")
+			}
+		})
+	}
+	return
+}
+
+// PostBind just updates the gang's BoundChildren
+func (p *GangPlugin) PostBind(ctx context.Context, _ *framework.CycleState, pod *corev1.Pod, nodeName string) {
+	gangCache := p.gangCache
+	gangCache.AddBoundPod(pod)
+	return
+}
+
+func RecoverGangCache(handle framework.Handle, gangCache *gangCache) error {
+	podLister := handle.SharedInformerFactory().Core().V1().Pods().Lister()
+	podsList, err := podLister.List(nil)
+	if err != nil {
+		klog.Errorf("RecoverGangCache podsList List error %+v", err)
+		return err
+	}
+	for _, pod := range podsList {
+		if pod.Annotations[extension.GangNameAnnotation] != "" {
+			gangCache.onPodAdd(pod)
+			if pod.Spec.NodeName != "" {
+				//todo: haven't decided how to distinguish assumed pods from bound pods; treat them as assumed for now, which does not affect the Permit counting
+				gangCache.AddAssumedPod(pod)
+			}
+		}
+	}
+	//todo: how should the schedulingCycle be recovered in Strict-Mode?
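+	//Note: recovery currently rebuilds the gang membership from the informer's pod list; pods that already have a NodeName are counted as assumed, so the Permit accounting stays consistent after the scheduler restarts.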
+	return nil
+}
+
+func (p *GangPlugin) PreFilterCheck(pod *corev1.Pod, gangName string, mode string) error {
+	gangCache := p.gangCache
+	var currentChildrenNum int
+	var minRequireChildrenNum int
+	var gangScheduleCycle int
+	var podScheduleCycle int
+	var found bool
+	//check whether the number of children has reached MinNumber
+	if currentChildrenNum, found = gangCache.GetChildrenNum(gangName); !found {
+		return fmt.Errorf("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+	}
+	if minRequireChildrenNum, found = gangCache.GetGangMinNum(gangName); !found {
+		return fmt.Errorf("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+	}
+
+	if currentChildrenNum < minRequireChildrenNum {
+		return fmt.Errorf("pre-filter pod %v cannot find enough children pods from Gang %v, "+
+			"current children number: %v, minRequiredNumber of Gang is %v", pod.Name, gangName, currentChildrenNum, minRequireChildrenNum)
+	}
+	//check whether the Gang has timed out
+	if pod.Annotations[extension.GangTimeOutAnnotation] == "true" {
+		return fmt.Errorf("pre-filter pod %v from Gang %v rejected, Gang has timed out", pod.Name, gangName)
+	}
+
+	if mode == extension.StrictMode {
+		if gangScheduleCycle, found = gangCache.GetGangScheduleCycle(gangName); !found {
+			return fmt.Errorf("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+		}
+		if podScheduleCycle, found = gangCache.GetChildScheduleCycle(gangName, pod.Name); !found {
+			return fmt.Errorf("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+		}
+		//firstly, check the pods whose podScheduleCycle is not less than gangScheduleCycle;
+		//actually this should not happen, because a pod is scheduled at most twice within one gangScheduleCycle,
+		//so its podScheduleCycle is never increased beyond the gangScheduleCycle
+		if podScheduleCycle >= gangScheduleCycle {
+			klog.Errorf("pre-filter pod %v from Gang %v: the pod's scheduleCycle is not less than the gangScheduleCycle", pod.Name, gangName)
+		}
+		//secondly, set the pod's cycle equal to gangScheduleCycle
+		gangCache.SetChildCycle(gangName, pod.Name, gangScheduleCycle)
+		//check whether the gang's scheduleCycle is valid
+		if valid, found := gangCache.IsGangScheduleCycleValid(gangName); !found {
+			return fmt.Errorf("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+		} else {
+			if !valid {
+				return fmt.Errorf("pre-filter pod %v from Gang %v rejected, Gang's ScheduleCycle is not valid", pod.Name, gangName)
+			}
+		}
+		//finally, check whether all the pods in this gangScheduleCycle have been handled
+		if gangTotalNum, found := gangCache.GetGangTotalNum(gangName); !found {
+			return fmt.Errorf("pre-filter pod %v from Gang %v rejected, didn't find the Gang in the cache", pod.Name, gangName)
+		} else {
+			if gangCache.CountChildNumWithCycle(gangName, gangScheduleCycle) == gangTotalNum {
+				gangCache.SetScheduleCycleValid(gangName, true)
+				gangCache.SetScheduleCycle(gangName, gangScheduleCycle+1)
+			}
+		}
+	}
+	return nil
+}
+
+func (p *GangPlugin) PermitCheck(pod *corev1.Pod) (time.Duration, extension.Status) {
+	gangName := pod.Annotations[extension.GangNameAnnotation]
+	gangCache := p.gangCache
+	waitTime, found := gangCache.GetGangWaitTime(gangName)
+	if !found {
+		return 0, extension.GangNotFoundInCache
+	}
+	//first we need to add the pod to the gang's assumed map
+	gangCache.AddAssumedPod(pod)
+	gangGroup, _ := gangCache.GetGangGroup(gangName)
+	allGangGroupSatisfied := true
+	//the gang group is empty: only consider the gang itself
+	if len(gangGroup)
== 0 { + allGangGroupSatisfied, _ = gangCache.IsGangResourceSatisfied(gangName) + } else { + //check each gang group + for _, groupName := range gangGroup { + if satisfied, _ := gangCache.IsGangResourceSatisfied(groupName); !satisfied { + allGangGroupSatisfied = false + break + } + } + } + if !allGangGroupSatisfied { + return waitTime, extension.Wait + } + return 0, extension.Success +} + +// ActivateGang +//Put all the pods belong to the Gang which in UnSchedulableQueue or backoffQueue back to activeQueue, +func (p *GangPlugin) ActivateGang(pod *corev1.Pod, state *framework.CycleState) { + gangName := pod.Annotations[extension.GangNameAnnotation] + pods, err := p.podLister.Pods(pod.Namespace).List(nil) + if err != nil { + klog.Errorf("ActivateGang Failed to list pods belong to a Gang: %v", gangName) + return + } + for i := range pods { + if pods[i].UID == pod.UID { + pods = append(pods[:i], pods[i+1:]...) + break + } + } + if len(pods) != 0 { + if c, err := state.Read(framework.PodsToActivateKey); err == nil { + if s, ok := c.(*framework.PodsToActivate); ok { + s.Lock() + for _, pod := range pods { + namespacedName := util.GetNamespacedName(pod) + s.Map[namespacedName] = pod + } + s.Unlock() + } + } + } +} + +func (p *GangPlugin) AllowGangGroup(pod *corev1.Pod) { + gangName := pod.Annotations[extension.GangNameAnnotation] + gangCache := p.gangCache + gangGroup, _ := gangCache.GetGangGroup(gangName) + //allow only the gang itself + if len(gangGroup) == 0 { + p.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Annotations[extension.GangNameAnnotation] == gangName { + klog.Infof("Permit allows pod %v from gang %v", waitingPod.GetPod().Name, gangName) + waitingPod.Allow(p.Name()) + } + }) + } else { + //allow each gang group + for _, groupName := range gangGroup { + p.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) { + if waitingPod.GetPod().Annotations[extension.GangNameAnnotation] == groupName { + klog.Infof("Permit allows pod %v from gang %v", waitingPod.GetPod().Name, gangName) + waitingPod.Allow(p.Name()) + } + }) + } + } + klog.Infof("Permit allows pod %v from gang %v", pod.Name, gangName) +} + +func (p *GangPlugin) GetCreatTime(podInfo *framework.QueuedPodInfo) time.Time { + gangName := podInfo.Pod.Annotations[extension.GangNameAnnotation] + //it doesn't belong to the gang,we get the creation time of the pod + if gangName == "" { + return podInfo.InitialAttemptTimestamp + } + //it belongs to a gang,we get the creation time of the Gang + gangCache := p.gangCache + createTime, found := gangCache.GetCreateTime(gangName) + if !found { + klog.Infof("GetGangCreatTime: gang %v is not found in the cache", gangName) + } + return createTime +} diff --git a/pkg/scheduler/plugins/nodenumaresource/cpu_allocator.go b/pkg/scheduler/plugins/nodenumaresource/cpu_allocator.go index 477abd2cf..3079e780e 100644 --- a/pkg/scheduler/plugins/nodenumaresource/cpu_allocator.go +++ b/pkg/scheduler/plugins/nodenumaresource/cpu_allocator.go @@ -20,7 +20,8 @@ import ( "fmt" "sort" - "github.com/koordinator-sh/koordinator/apis/extension" + "k8s.io/apimachinery/pkg/util/sets" + schedulingconfig "github.com/koordinator-sh/koordinator/apis/scheduling/config" ) @@ -41,11 +42,11 @@ func takeCPUs( availableCPUs CPUSet, allocatedCPUs CPUDetails, numCPUsNeeded int, - cpuBindPolicy extension.CPUBindPolicy, - exclusive bool, + cpuBindPolicy schedulingconfig.CPUBindPolicy, + cpuExclusivePolicy schedulingconfig.CPUExclusivePolicy, 
numaAllocatedStrategy schedulingconfig.NUMAAllocateStrategy, ) (CPUSet, error) { - acc := newCPUAccumulator(topology, availableCPUs, allocatedCPUs, numCPUsNeeded, exclusive, numaAllocatedStrategy) + acc := newCPUAccumulator(topology, availableCPUs, allocatedCPUs, numCPUsNeeded, cpuExclusivePolicy, numaAllocatedStrategy) if acc.isSatisfied() { return acc.result, nil } @@ -58,12 +59,15 @@ func takeCPUs( // According to the NUMA allocation strategy, // select the NUMA Node with the most remaining amount or the least amount remaining // and the total amount of available CPUs in the NUMA Node is greater than or equal to the number of CPUs needed + filterExclusiveArgs := []bool{true, false} if acc.numCPUsNeeded <= acc.topology.CPUsPerNode() { - freeCPUs := acc.freeCoresInNode(true) - for _, cpus := range freeCPUs { - if len(cpus) >= acc.numCPUsNeeded { - acc.take(cpus[:acc.numCPUsNeeded]...) - return acc.result, nil + for _, filterExclusive := range filterExclusiveArgs { + freeCPUs := acc.freeCoresInNode(true, filterExclusive) + for _, cpus := range freeCPUs { + if len(cpus) >= acc.numCPUsNeeded { + acc.take(cpus[:acc.numCPUsNeeded]...) + return acc.result, nil + } } } } @@ -181,10 +185,12 @@ func takeCPUs( type cpuAccumulator struct { topology *CPUTopology - details CPUDetails + allocatableCPUs CPUDetails numCPUsNeeded int exclusive bool - exclusiveInCores map[int]bool + exclusiveInCores sets.Int + exclusiveInNUMANodes sets.Int + exclusivePolicy schedulingconfig.CPUExclusivePolicy numaAllocateStrategy schedulingconfig.NUMAAllocateStrategy result CPUSet } @@ -194,22 +200,32 @@ func newCPUAccumulator( availableCPUs CPUSet, allocatedCPUs CPUDetails, numCPUsNeeded int, - exclusive bool, - numaSortStrategy schedulingconfig.NUMAAllocateStrategy, + exclusivePolicy schedulingconfig.CPUExclusivePolicy, + numaAllocateStrategy schedulingconfig.NUMAAllocateStrategy, ) *cpuAccumulator { - exclusiveInCores := make(map[int]bool) + exclusiveInCores := sets.NewInt() + exclusiveInNUMANodes := sets.NewInt() for _, v := range allocatedCPUs { - exclusiveInCores[v.CoreID] = v.Exclusive + if v.ExclusivePolicy == schedulingconfig.CPUExclusivePolicyPCPULevel { + exclusiveInCores.Insert(v.CoreID) + } else if v.ExclusivePolicy == schedulingconfig.CPUExclusivePolicyNUMANodeLevel { + exclusiveInNUMANodes.Insert(v.NodeID) + } } - details := topology.CPUDetails.KeepOnly(availableCPUs) + allocatableCPUs := topology.CPUDetails.KeepOnly(availableCPUs) + + exclusive := exclusivePolicy == schedulingconfig.CPUExclusivePolicyPCPULevel || + exclusivePolicy == schedulingconfig.CPUExclusivePolicyNUMANodeLevel return &cpuAccumulator{ topology: topology, - details: details, + allocatableCPUs: allocatableCPUs, exclusiveInCores: exclusiveInCores, + exclusiveInNUMANodes: exclusiveInNUMANodes, exclusive: exclusive, + exclusivePolicy: exclusivePolicy, numCPUsNeeded: numCPUsNeeded, - numaAllocateStrategy: numaSortStrategy, + numaAllocateStrategy: numaAllocateStrategy, result: NewCPUSet(), } } @@ -217,10 +233,14 @@ func newCPUAccumulator( func (a *cpuAccumulator) take(cpus ...int) { a.result = a.result.UnionSlice(cpus...) 
for _, cpu := range cpus { - delete(a.details, cpu) + delete(a.allocatableCPUs, cpu) if a.exclusive { cpuInfo := a.topology.CPUDetails[cpu] - a.exclusiveInCores[cpuInfo.CoreID] = true + if a.exclusivePolicy == schedulingconfig.CPUExclusivePolicyPCPULevel { + a.exclusiveInCores.Insert(cpuInfo.CoreID) + } else if a.exclusivePolicy == schedulingconfig.CPUExclusivePolicyNUMANodeLevel { + a.exclusiveInNUMANodes.Insert(cpuInfo.NodeID) + } } } a.numCPUsNeeded -= len(cpus) @@ -235,15 +255,21 @@ func (a *cpuAccumulator) isSatisfied() bool { } func (a *cpuAccumulator) isFailed() bool { - return a.numCPUsNeeded > len(a.details) + return a.numCPUsNeeded > len(a.allocatableCPUs) +} + +func (a *cpuAccumulator) isCPUExclusivePCPULevel(cpuInfo *CPUInfo) bool { + if a.exclusivePolicy != schedulingconfig.CPUExclusivePolicyPCPULevel { + return false + } + return a.exclusiveInCores.Has(cpuInfo.CoreID) } -func (a *cpuAccumulator) isCPUExclusive(cpuID int) bool { - if !a.exclusive { +func (a *cpuAccumulator) isCPUExclusiveNUMANodeLevel(cpuInfo *CPUInfo) bool { + if a.exclusivePolicy != schedulingconfig.CPUExclusivePolicyNUMANodeLevel { return false } - cpuInfo := a.topology.CPUDetails[cpuID] - return a.exclusiveInCores[cpuInfo.CoreID] + return a.exclusiveInNUMANodes.Has(cpuInfo.NodeID) } func (a *cpuAccumulator) extractCPU(cpus []int) []int { @@ -278,12 +304,15 @@ func (a *cpuAccumulator) sortCores(cores []int, cpusInCores map[int][]int) { } // freeCoresInNode returns the logical cpus of the free cores in nodes that sorted -func (a *cpuAccumulator) freeCoresInNode(filterFullFreeCore bool) [][]int { - details := a.details +func (a *cpuAccumulator) freeCoresInNode(filterFullFreeCore bool, filterExclusive bool) [][]int { + allocatableCPUs := a.allocatableCPUs socketFreeScores := make(map[int]int) cpusInCores := make(map[int][]int) - for _, cpuInfo := range details { + for _, cpuInfo := range allocatableCPUs { + if filterExclusive && a.isCPUExclusiveNUMANodeLevel(&cpuInfo) { + continue + } cpus := cpusInCores[cpuInfo.CoreID] if len(cpus) == 0 { cpus = make([]int, 0, a.topology.CPUsPerCore()) @@ -299,7 +328,7 @@ func (a *cpuAccumulator) freeCoresInNode(filterFullFreeCore bool) [][]int { if filterFullFreeCore && len(cpus) != a.topology.CPUsPerCore() { continue } - info := details[cpus[0]] + info := allocatableCPUs[cpus[0]] cores := coresInNodes[info.NodeID] if len(cores) == 0 { cores = make([]int, 0, a.topology.CPUsPerNode()/a.topology.CPUsPerCore()) @@ -328,8 +357,8 @@ func (a *cpuAccumulator) freeCoresInNode(filterFullFreeCore bool) [][]int { jCPUs := cpusInNodes[nodeIDs[j]] // each cpu's socketId and nodeId in same node are same - iCPUInfo := details[iCPUs[0]] - jCPUInfo := details[jCPUs[0]] + iCPUInfo := allocatableCPUs[iCPUs[0]] + jCPUInfo := allocatableCPUs[jCPUs[0]] iSocket := iCPUInfo.SocketID jSocket := jCPUInfo.SocketID @@ -369,10 +398,10 @@ func (a *cpuAccumulator) freeCoresInNode(filterFullFreeCore bool) [][]int { // freeCoresInSocket returns the logical cpus of the free cores in sockets that sorted func (a *cpuAccumulator) freeCoresInSocket(filterFullFreeCore bool) [][]int { - details := a.details + allocatableCPUs := a.allocatableCPUs cpusInCores := make(map[int][]int) - for _, cpuInfo := range details { + for _, cpuInfo := range allocatableCPUs { cpus := cpusInCores[cpuInfo.CoreID] if len(cpus) == 0 { cpus = make([]int, 0, a.topology.CPUsPerCore()) @@ -387,7 +416,7 @@ func (a *cpuAccumulator) freeCoresInSocket(filterFullFreeCore bool) [][]int { if filterFullFreeCore && len(cpus) != 
a.topology.CPUsPerCore() { continue } - info := details[cpus[0]] + info := allocatableCPUs[cpus[0]] cores := coresInSockets[info.SocketID] if len(cores) == 0 { cores = make([]int, 0, a.topology.CPUsPerSocket()/a.topology.CPUsPerCore()) @@ -438,8 +467,8 @@ func (a *cpuAccumulator) freeCPUsInNode(filterExclusive bool) [][]int { cpusInNodes := make(map[int][]int) nodeFreeScores := make(map[int]int) socketFreeScores := make(map[int]int) - for _, cpuInfo := range a.details { - if filterExclusive && a.isCPUExclusive(cpuInfo.CPUID) { + for _, cpuInfo := range a.allocatableCPUs { + if filterExclusive && (a.isCPUExclusivePCPULevel(&cpuInfo) || a.isCPUExclusiveNUMANodeLevel(&cpuInfo)) { continue } cpus := cpusInNodes[cpuInfo.NodeID] @@ -466,8 +495,8 @@ func (a *cpuAccumulator) freeCPUsInNode(filterExclusive bool) [][]int { iCPUs := cpusInNodes[nodeIDs[i]] jCPUs := cpusInNodes[nodeIDs[j]] - iCPUInfo := a.details[iCPUs[0]] - jCPUInfo := a.details[jCPUs[0]] + iCPUInfo := a.allocatableCPUs[iCPUs[0]] + jCPUInfo := a.allocatableCPUs[jCPUs[0]] iNode := iCPUInfo.NodeID jNode := jCPUInfo.NodeID @@ -510,10 +539,10 @@ func (a *cpuAccumulator) freeCPUsInNode(filterExclusive bool) [][]int { // freeCPUsInSocket returns free logical cpus in sockets that sorted in ascending order. func (a *cpuAccumulator) freeCPUsInSocket(filterExclusive bool) [][]int { - details := a.details + allocatableCPUs := a.allocatableCPUs cpusInSockets := make(map[int][]int) - for _, cpuInfo := range details { - if filterExclusive && a.isCPUExclusive(cpuInfo.CPUID) { + for _, cpuInfo := range allocatableCPUs { + if filterExclusive && a.isCPUExclusivePCPULevel(&cpuInfo) { continue } cpus := cpusInSockets[cpuInfo.SocketID] @@ -564,14 +593,14 @@ func (a *cpuAccumulator) freeCPUsInSocket(filterExclusive bool) [][]int { // - node ID // - core ID func (a *cpuAccumulator) freeCPUs(filterExclusive bool) []int { - details := a.details + allocatableCPUs := a.allocatableCPUs cpusInCores := make(map[int][]int) coresToSocket := make(map[int]int) coresToNode := make(map[int]int) nodeFreeScores := make(map[int]int) socketFreeScores := make(map[int]int) - for _, cpuInfo := range details { - if filterExclusive && a.isCPUExclusive(cpuInfo.CPUID) { + for _, cpuInfo := range allocatableCPUs { + if filterExclusive && (a.isCPUExclusivePCPULevel(&cpuInfo) || a.isCPUExclusiveNUMANodeLevel(&cpuInfo)) { continue } diff --git a/pkg/scheduler/plugins/nodenumaresource/cpu_allocator_test.go b/pkg/scheduler/plugins/nodenumaresource/cpu_allocator_test.go index b00f85a49..9bf12b4bd 100644 --- a/pkg/scheduler/plugins/nodenumaresource/cpu_allocator_test.go +++ b/pkg/scheduler/plugins/nodenumaresource/cpu_allocator_test.go @@ -20,8 +20,6 @@ import ( "reflect" "testing" - "k8s.io/apimachinery/pkg/util/sets" - schedulingconfig "github.com/koordinator-sh/koordinator/apis/scheduling/config" ) @@ -145,7 +143,7 @@ func TestTakeFullPCPUs(t *testing.T) { allocatedCPUsDetails := tt.topology.CPUDetails.KeepOnly(tt.allocatedCPUs) result, err := takeCPUs( tt.topology, availableCPUs, allocatedCPUsDetails, - tt.numCPUsNeeded, schedulingconfig.CPUBindPolicyFullPCPUs, false, schedulingconfig.NUMAMostAllocated) + tt.numCPUsNeeded, schedulingconfig.CPUBindPolicyFullPCPUs, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMAMostAllocated) if tt.wantError && err == nil { t.Fatal("expect error but got nil") } else if !tt.wantError && err != nil { @@ -249,7 +247,7 @@ func TestTakeFullPCPUsWithNUMALeastAllocated(t *testing.T) { allocatedCPUsDetails := 
tt.topology.CPUDetails.KeepOnly(tt.allocatedCPUs) result, err := takeCPUs( tt.topology, availableCPUs, allocatedCPUsDetails, - tt.numCPUsNeeded, schedulingconfig.CPUBindPolicyFullPCPUs, false, schedulingconfig.NUMALeastAllocated) + tt.numCPUsNeeded, schedulingconfig.CPUBindPolicyFullPCPUs, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMALeastAllocated) if tt.wantError && err == nil { t.Fatal("expect error but got nil") } else if !tt.wantError && err != nil { @@ -262,6 +260,16 @@ func TestTakeFullPCPUsWithNUMALeastAllocated(t *testing.T) { } } +func TestCPUSpreadByPCPUs(t *testing.T) { + topology := buildCPUTopologyForTest(2, 2, 4, 2) + acc := newCPUAccumulator(topology, topology.CPUDetails.CPUs(), nil, 8, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMAMostAllocated) + result := acc.freeCPUs(false) + result = acc.spreadCPUs(result) + if !reflect.DeepEqual([]int{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}, result) { + t.Fatal("unexpect spread result") + } +} + func TestTakeSpreadByPCPUs(t *testing.T) { tests := []struct { name string @@ -306,7 +314,7 @@ func TestTakeSpreadByPCPUs(t *testing.T) { allocatedCPUsDetails := tt.topology.CPUDetails.KeepOnly(tt.allocatedCPUs) result, err := takeCPUs( tt.topology, availableCPUs, allocatedCPUsDetails, - tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, false, schedulingconfig.NUMAMostAllocated) + tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMAMostAllocated) if tt.wantError && err == nil { t.Fatal("expect error but got nil") } else if !tt.wantError && err != nil { @@ -319,6 +327,16 @@ func TestTakeSpreadByPCPUs(t *testing.T) { } } +func TestCPUSpreadByPCPUsWithNUMALeastAllocated(t *testing.T) { + topology := buildCPUTopologyForTest(2, 2, 4, 2) + acc := newCPUAccumulator(topology, topology.CPUDetails.CPUs(), nil, 8, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMALeastAllocated) + result := acc.freeCPUs(false) + result = acc.spreadCPUs(result) + if !reflect.DeepEqual([]int{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}, result) { + t.Fatal("unexpect spread result") + } +} + func TestTakeSpreadByPCPUsWithNUMALeastAllocated(t *testing.T) { tests := []struct { name string @@ -363,7 +381,7 @@ func TestTakeSpreadByPCPUsWithNUMALeastAllocated(t *testing.T) { allocatedCPUsDetails := tt.topology.CPUDetails.KeepOnly(tt.allocatedCPUs) result, err := takeCPUs( tt.topology, availableCPUs, allocatedCPUsDetails, - tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, false, schedulingconfig.NUMALeastAllocated) + tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMALeastAllocated) if tt.wantError && err == nil { t.Fatal("expect error but got nil") } else if !tt.wantError && err != nil { @@ -376,120 +394,110 @@ func TestTakeSpreadByPCPUsWithNUMALeastAllocated(t *testing.T) { } } -func TestTakeSpreadByPCPUsWithCoreLevelExclusive(t *testing.T) { +func TestTakeCPUsWithExclusivePolicy(t *testing.T) { tests := []struct { - name string - topology *CPUTopology - allocatedCPUs CPUSet - cpusInApps CPUSet - cpusInServiceUnits map[string][]int - currentAppName string - currentServiceUnit string - cpuMutexApps sets.String - cpuMutexServiceUnits sets.String - numCPUsNeeded int - wantError bool - wantResult CPUSet + name string + 
topology *CPUTopology + allocatedExclusiveCPUs CPUSet + allocatedExclusivePolicy schedulingconfig.CPUExclusivePolicy + bindPolicy schedulingconfig.CPUBindPolicy + exclusivePolicy schedulingconfig.CPUExclusivePolicy + numCPUsNeeded int + wantError bool + wantResult CPUSet }{ { - name: "allocate cpus on full-free socket with other CPUMutex apps", - topology: buildCPUTopologyForTest(2, 1, 4, 2), - allocatedCPUs: NewCPUSet(0, 2), - cpusInApps: NewCPUSet(0, 2), - cpuMutexApps: sets.NewString("test-app-1", "test-app-2"), - currentAppName: "test-app-2", - currentServiceUnit: "test-app-2-host", - numCPUsNeeded: 4, - wantResult: NewCPUSet(8, 10, 12, 14), - }, - { - name: "allocate cpus on full-free socket with same CPUMutex apps", - topology: buildCPUTopologyForTest(2, 1, 4, 2), - allocatedCPUs: NewCPUSet(0, 2), - cpusInApps: NewCPUSet(0, 2), - cpuMutexApps: sets.NewString("test-app-1"), - currentAppName: "test-app-1", - currentServiceUnit: "test-app-1-host", - numCPUsNeeded: 4, - wantResult: NewCPUSet(8, 10, 12, 14), - }, - { - name: "allocate overlapped cpus with same CPUMutex apps", - topology: buildCPUTopologyForTest(2, 1, 4, 2), - cpuMutexApps: sets.NewString("test-app-1"), - currentAppName: "test-app-1", - currentServiceUnit: "test-app-1-host", - numCPUsNeeded: 10, - wantResult: NewCPUSet(0, 1, 2, 3, 4, 6, 8, 10, 12, 14), - }, - { - name: "allocate cpus on large-size partially-allocated socket with other CPUMutex apps", - topology: buildCPUTopologyForTest(2, 1, 8, 2), - allocatedCPUs: NewCPUSet(0, 2), - cpusInApps: NewCPUSet(0, 2), - cpuMutexApps: sets.NewString("test-app-1", "test-app-2"), - currentAppName: "test-app-2", - currentServiceUnit: "test-app-2-host", - numCPUsNeeded: 4, - wantResult: NewCPUSet(4, 6, 8, 10), - }, - { - name: "allocate cpus on full-free socket with other CPUMutex serviceUnits", - topology: buildCPUTopologyForTest(2, 1, 4, 2), - allocatedCPUs: NewCPUSet(0, 2), - cpusInApps: NewCPUSet(0, 2), - cpuMutexServiceUnits: sets.NewString("test-app-1-host", "test-app-2-host"), - currentAppName: "test-app-2", - currentServiceUnit: "test-app-2-host", - numCPUsNeeded: 4, - wantResult: NewCPUSet(8, 10, 12, 14), - }, - { - name: "allocate cpus on full-free socket with same CPUMutex serviceUnits", - topology: buildCPUTopologyForTest(2, 1, 4, 2), - allocatedCPUs: NewCPUSet(0, 2), - cpusInApps: NewCPUSet(0, 2), - cpuMutexServiceUnits: sets.NewString("test-app-1-host"), - currentAppName: "test-app-1", - currentServiceUnit: "test-app-1-host", - numCPUsNeeded: 4, - wantResult: NewCPUSet(8, 10, 12, 14), - }, - { - name: "allocate overlapped cpus with same CPUMutex serviceUnits", - topology: buildCPUTopologyForTest(2, 1, 4, 2), - cpuMutexServiceUnits: sets.NewString("test-app-1-host"), - currentAppName: "test-app-1", - currentServiceUnit: "test-app-1-host", - numCPUsNeeded: 10, - wantResult: NewCPUSet(0, 1, 2, 3, 4, 6, 8, 10, 12, 14), - }, - { - name: "allocate cpus on large-size partially-allocated socket with other CPUMutex serviceUnits", - topology: buildCPUTopologyForTest(2, 1, 8, 2), - allocatedCPUs: NewCPUSet(0, 2), - cpusInApps: NewCPUSet(0, 2), - cpuMutexServiceUnits: sets.NewString("test-app-1-host", "test-app-2-host"), - currentAppName: "test-app-2", - currentServiceUnit: "test-app-2-host", - numCPUsNeeded: 4, - wantResult: NewCPUSet(4, 6, 8, 10), + name: "allocate cpus on full-free socket with PCPULevel", + topology: buildCPUTopologyForTest(2, 1, 4, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + numCPUsNeeded: 4, + wantResult: NewCPUSet(8, 10, 12, 14), + }, + { + name: 
"allocate overlapped cpus with PCPULevel", + topology: buildCPUTopologyForTest(2, 1, 4, 2), + numCPUsNeeded: 10, + wantResult: NewCPUSet(0, 1, 2, 3, 4, 6, 8, 10, 12, 14), + }, + { + name: "allocate cpus on large-size partially-allocated socket with PCPULevel", + topology: buildCPUTopologyForTest(2, 1, 8, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + numCPUsNeeded: 4, + wantResult: NewCPUSet(4, 6, 8, 10), + }, + { + name: "allocate cpus with none exclusive policy", + topology: buildCPUTopologyForTest(2, 1, 8, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + exclusivePolicy: schedulingconfig.CPUExclusivePolicyNone, + numCPUsNeeded: 4, + wantResult: NewCPUSet(1, 3, 4, 6), + }, + { + name: "allocate cpus on full-free socket with NUMANodeLevel", + topology: buildCPUTopologyForTest(2, 1, 4, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + allocatedExclusivePolicy: schedulingconfig.CPUExclusivePolicyNUMANodeLevel, + exclusivePolicy: schedulingconfig.CPUExclusivePolicyNUMANodeLevel, + numCPUsNeeded: 4, + wantResult: NewCPUSet(8, 10, 12, 14), + }, + { + name: "allocate cpus on partially-allocated socket without NUMANodeLevel", + topology: buildCPUTopologyForTest(2, 1, 4, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + allocatedExclusivePolicy: schedulingconfig.CPUExclusivePolicyNUMANodeLevel, + exclusivePolicy: schedulingconfig.CPUExclusivePolicyNone, + numCPUsNeeded: 4, + wantResult: NewCPUSet(1, 3, 4, 6), + }, + { + name: "allocate cpus on full-free socket with NUMANodeLevel with PCPUs", + topology: buildCPUTopologyForTest(2, 1, 4, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + allocatedExclusivePolicy: schedulingconfig.CPUExclusivePolicyNUMANodeLevel, + exclusivePolicy: schedulingconfig.CPUExclusivePolicyNUMANodeLevel, + bindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 4, + wantResult: NewCPUSet(8, 9, 10, 11), + }, + { + name: "allocate cpus on partially-allocated socket without NUMANodeLevel with PCPUs", + topology: buildCPUTopologyForTest(2, 1, 4, 2), + allocatedExclusiveCPUs: NewCPUSet(0, 2), + allocatedExclusivePolicy: schedulingconfig.CPUExclusivePolicyNUMANodeLevel, + exclusivePolicy: schedulingconfig.CPUExclusivePolicyNone, + bindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 4, + wantResult: NewCPUSet(4, 5, 6, 7), }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - availableCPUs := tt.topology.CPUDetails.CPUs().Difference(tt.allocatedCPUs) - allocatedCPUsDetails := tt.topology.CPUDetails.KeepOnly(tt.allocatedCPUs) - for _, cpuID := range tt.cpusInApps.ToSliceNoSort() { + availableCPUs := tt.topology.CPUDetails.CPUs().Difference(tt.allocatedExclusiveCPUs) + allocatedCPUsDetails := tt.topology.CPUDetails.KeepOnly(tt.allocatedExclusiveCPUs) + for _, cpuID := range tt.allocatedExclusiveCPUs.ToSliceNoSort() { cpuInfo := allocatedCPUsDetails[cpuID] - cpuInfo.Exclusive = true + if tt.allocatedExclusivePolicy != "" { + cpuInfo.ExclusivePolicy = tt.allocatedExclusivePolicy + } else { + cpuInfo.ExclusivePolicy = schedulingconfig.CPUExclusivePolicyPCPULevel + } allocatedCPUsDetails[cpuID] = cpuInfo } + if tt.exclusivePolicy == "" { + tt.exclusivePolicy = schedulingconfig.CPUExclusivePolicyPCPULevel + } + if tt.bindPolicy == "" { + tt.bindPolicy = schedulingconfig.CPUBindPolicySpreadByPCPUs + } + result, err := takeCPUs( tt.topology, availableCPUs, allocatedCPUsDetails, - tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, true, schedulingconfig.NUMAMostAllocated) + tt.numCPUsNeeded, tt.bindPolicy, tt.exclusivePolicy, 
schedulingconfig.NUMAMostAllocated) if tt.wantError && err == nil { t.Fatal("expect error but got nil") } else if !tt.wantError && err != nil { @@ -502,26 +510,6 @@ func TestTakeSpreadByPCPUsWithCoreLevelExclusive(t *testing.T) { } } -func TestCPUSpreadByPCPUs(t *testing.T) { - topology := buildCPUTopologyForTest(2, 2, 4, 2) - acc := newCPUAccumulator(topology, topology.CPUDetails.CPUs(), nil, 8, false, schedulingconfig.NUMAMostAllocated) - result := acc.freeCPUs(false) - result = acc.spreadCPUs(result) - if !reflect.DeepEqual([]int{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}, result) { - t.Fatal("unexpect spread result") - } -} - -func TestCPUSpreadByPCPUsWithNUMALeastAllocated(t *testing.T) { - topology := buildCPUTopologyForTest(2, 2, 4, 2) - acc := newCPUAccumulator(topology, topology.CPUDetails.CPUs(), nil, 8, false, schedulingconfig.NUMALeastAllocated) - result := acc.freeCPUs(false) - result = acc.spreadCPUs(result) - if !reflect.DeepEqual([]int{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}, result) { - t.Fatal("unexpect spread result") - } -} - func BenchmarkTakeCPUsWithSameCoreFirst(b *testing.B) { tests := []struct { name string @@ -564,7 +552,7 @@ func BenchmarkTakeCPUsWithSameCoreFirst(b *testing.B) { b.Run(tt.name, func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := takeCPUs( - topology, cpus, nil, tt.numCPUsNeeded, schedulingconfig.CPUBindPolicyFullPCPUs, false, schedulingconfig.NUMAMostAllocated) + topology, cpus, nil, tt.numCPUsNeeded, schedulingconfig.CPUBindPolicyFullPCPUs, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMAMostAllocated) if err != nil { b.Fatal(err) } @@ -616,7 +604,7 @@ func BenchmarkTakeCPUsWithSpread(b *testing.B) { b.Run(tt.name, func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := takeCPUs( - topology, cpus, nil, tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, false, schedulingconfig.NUMAMostAllocated) + topology, cpus, nil, tt.numCPUsNeeded, schedulingconfig.CPUBindPolicySpreadByPCPUs, schedulingconfig.CPUExclusivePolicyNone, schedulingconfig.NUMAMostAllocated) if err != nil { b.Fatal(err) } diff --git a/pkg/scheduler/plugins/nodenumaresource/cpu_topology.go b/pkg/scheduler/plugins/nodenumaresource/cpu_topology.go index 1c46604e5..9f26e7677 100644 --- a/pkg/scheduler/plugins/nodenumaresource/cpu_topology.go +++ b/pkg/scheduler/plugins/nodenumaresource/cpu_topology.go @@ -16,6 +16,10 @@ limitations under the License. package nodenumaresource +import ( + "github.com/koordinator-sh/koordinator/apis/extension" +) + // CPUTopology contains details of node cpu type CPUTopology struct { NumCPUs int @@ -64,12 +68,12 @@ func NewCPUDetails() CPUDetails { // CPUInfo contains the NUMA, socket, and core IDs associated with a CPU. 
type CPUInfo struct { - CPUID int - CoreID int - NodeID int - SocketID int - RefCount int - Exclusive bool + CPUID int + CoreID int + NodeID int + SocketID int + RefCount int + ExclusivePolicy extension.CPUExclusivePolicy } // Clone clones the CPUDetails diff --git a/pkg/scheduler/plugins/nodenumaresource/node_numa_info.go b/pkg/scheduler/plugins/nodenumaresource/node_numa_info.go index 3415827d8..695f4e80c 100644 --- a/pkg/scheduler/plugins/nodenumaresource/node_numa_info.go +++ b/pkg/scheduler/plugins/nodenumaresource/node_numa_info.go @@ -25,6 +25,7 @@ import ( "k8s.io/client-go/tools/cache" "github.com/koordinator-sh/koordinator/apis/extension" + schedulingconfig "github.com/koordinator-sh/koordinator/apis/scheduling/config" "github.com/koordinator-sh/koordinator/pkg/util" ) @@ -179,6 +180,12 @@ func (c *nodeNumaInfoCache) setPod(pod *corev1.Pod) { if err != nil { return } + + resourceSpec, err := extension.GetResourceSpec(pod.Annotations) + if err != nil { + return + } + cpuset, err := Parse(resourceStatus.CPUSet) if err != nil || cpuset.IsEmpty() { return @@ -186,7 +193,7 @@ func (c *nodeNumaInfoCache) setPod(pod *corev1.Pod) { numaInfo.lock.Lock() defer numaInfo.lock.Unlock() - numaInfo.allocateCPUs(pod.UID, cpuset) + numaInfo.allocateCPUs(pod.UID, cpuset, resourceSpec.PreferredCPUExclusivePolicy) } func (c *nodeNumaInfoCache) deletePod(pod *corev1.Pod) { @@ -217,7 +224,7 @@ func (n *nodeNUMAInfo) updateCPUTopology(topology *CPUTopology) { n.cpuTopology = topology } -func (n *nodeNUMAInfo) allocateCPUs(podUID types.UID, cpuset CPUSet) { +func (n *nodeNUMAInfo) allocateCPUs(podUID types.UID, cpuset CPUSet, exclusivePolicy schedulingconfig.CPUExclusivePolicy) { if _, ok := n.allocatedPods[podUID]; ok { return } @@ -228,6 +235,7 @@ func (n *nodeNUMAInfo) allocateCPUs(podUID types.UID, cpuset CPUSet) { if !ok { cpuInfo = n.cpuTopology.CPUDetails[cpuID] } + cpuInfo.ExclusivePolicy = exclusivePolicy cpuInfo.RefCount++ n.allocatedCPUs[cpuID] = cpuInfo } diff --git a/pkg/scheduler/plugins/nodenumaresource/plugin.go b/pkg/scheduler/plugins/nodenumaresource/plugin.go index ec9bdb88d..78a0ab9e1 100644 --- a/pkg/scheduler/plugins/nodenumaresource/plugin.go +++ b/pkg/scheduler/plugins/nodenumaresource/plugin.go @@ -53,6 +53,8 @@ const ( const ( ErrMissingNodeResourceTopology = "node(s) missing NodeResourceTopology" ErrInvalidCPUTopology = "node(s) invalid CPU Topology" + ErrSMTAlignmentError = "node(s) requested cpus not multiple cpus per core" + ErrRequiredFullPCPUsPolicy = "node(s) required FullPCPUs policy" ) var ( @@ -105,10 +107,12 @@ func New(args runtime.Object, handle framework.Handle) (framework.Plugin, error) func (p *Plugin) Name() string { return Name } type preFilterState struct { - skip bool - resourceSpec *extension.ResourceSpec - numCPUsNeeded int - allocatedCPUs CPUSet + skip bool + resourceSpec *extension.ResourceSpec + preferredCPUBindPolicy schedulingconfig.CPUBindPolicy + preferredCPUExclusivePolicy schedulingconfig.CPUExclusivePolicy + numCPUsNeeded int + allocatedCPUs CPUSet } func (s *preFilterState) Clone() framework.StateData { @@ -132,8 +136,12 @@ func (p *Plugin) PreFilter(ctx context.Context, cycleState *framework.CycleState qosClass := extension.GetPodQoSClass(pod) priorityClass := extension.GetPriorityClass(pod) if (qosClass == extension.QoSLSE || qosClass == extension.QoSLSR) && priorityClass == extension.PriorityProd { - if resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs || - resourceSpec.PreferredCPUBindPolicy == 
schedulingconfig.CPUBindPolicySpreadByPCPUs { + preferredCPUBindPolicy := resourceSpec.PreferredCPUBindPolicy + if preferredCPUBindPolicy == "" || preferredCPUBindPolicy == schedulingconfig.CPUBindPolicyDefault { + preferredCPUBindPolicy = p.pluginArgs.DefaultCPUBindPolicy + } + if preferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs || + preferredCPUBindPolicy == schedulingconfig.CPUBindPolicySpreadByPCPUs { requests, _ := resourceapi.PodRequestsAndLimits(pod) requestedCPU := requests.Cpu().MilliValue() if requestedCPU%1000 != 0 { @@ -143,6 +151,8 @@ func (p *Plugin) PreFilter(ctx context.Context, cycleState *framework.CycleState if requestedCPU > 0 { state.skip = false state.resourceSpec = resourceSpec + state.preferredCPUBindPolicy = preferredCPUBindPolicy + state.preferredCPUExclusivePolicy = resourceSpec.PreferredCPUExclusivePolicy state.numCPUsNeeded = int(requestedCPU / 1000) } } @@ -193,6 +203,15 @@ func (p *Plugin) Filter(ctx context.Context, cycleState *framework.CycleState, p return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrInvalidCPUTopology) } + if node.Labels[extension.LabelNodeCPUBindPolicy] == extension.NodeCPUBindPolicyFullPCPUsOnly { + if state.numCPUsNeeded%numaInfo.cpuTopology.CPUsPerCore() != 0 { + return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrSMTAlignmentError) + } + if state.preferredCPUBindPolicy != schedulingconfig.CPUBindPolicyFullPCPUs { + return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrRequiredFullPCPUsPolicy) + } + } + return nil } @@ -226,25 +245,37 @@ func (p *Plugin) Score(ctx context.Context, cycleState *framework.CycleState, po return 0, nil } - score := p.calcScore(state.numCPUsNeeded, state.resourceSpec, numaInfo) + numaAllocateStrategy := p.getNUMAAllocateStrategy(node) + score := p.calcScore(numaInfo, state.numCPUsNeeded, state.preferredCPUBindPolicy, state.preferredCPUExclusivePolicy, numaAllocateStrategy) return score, nil } -func (p *Plugin) calcScore(numCPUsNeeded int, resourceSpec *extension.ResourceSpec, numaInfo *nodeNUMAInfo) int64 { +func (p *Plugin) getNUMAAllocateStrategy(node *corev1.Node) schedulingconfig.NUMAAllocateStrategy { + numaAllocateStrategy := schedulingconfig.NUMAMostAllocated + if p.pluginArgs.ScoringStrategy != nil && p.pluginArgs.ScoringStrategy.Type == schedulingconfig.LeastAllocated { + numaAllocateStrategy = schedulingconfig.NUMALeastAllocated + } + if val := schedulingconfig.NUMAAllocateStrategy(node.Labels[extension.LabelNodeNUMAAllocateStrategy]); val != "" { + numaAllocateStrategy = val + } + return numaAllocateStrategy +} + +func (p *Plugin) calcScore(numaInfo *nodeNUMAInfo, numCPUsNeeded int, cpuBindPolicy schedulingconfig.CPUBindPolicy, cpuExclusivePolicy schedulingconfig.CPUExclusivePolicy, numaAllocateStrategy schedulingconfig.NUMAAllocateStrategy) int64 { availableCPUs, allocated := getAvailableCPUsFunc(numaInfo) acc := newCPUAccumulator( numaInfo.cpuTopology, availableCPUs, allocated, numCPUsNeeded, - false, - p.pluginArgs.NUMAAllocateStrategy, + cpuExclusivePolicy, + numaAllocateStrategy, ) var freeCPUs [][]int - if resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs { + if cpuBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs { if numCPUsNeeded <= numaInfo.cpuTopology.CPUsPerNode() { - freeCPUs = acc.freeCoresInNode(true) + freeCPUs = acc.freeCoresInNode(true, true) } else if numCPUsNeeded <= numaInfo.cpuTopology.CPUsPerSocket() { freeCPUs = acc.freeCoresInSocket(true) } @@ -257,7 +288,7 @@ func (p 
*Plugin) calcScore(numCPUsNeeded int, resourceSpec *extension.ResourceSp } scoreFn := mostRequestedScore - if p.pluginArgs.ScoringStrategy != nil && p.pluginArgs.ScoringStrategy.Type == schedulingconfig.LeastAllocated { + if numaAllocateStrategy == schedulingconfig.NUMALeastAllocated { scoreFn = leastRequestedScore } @@ -327,6 +358,15 @@ func (p *Plugin) Reserve(ctx context.Context, cycleState *framework.CycleState, return nil } + nodeInfo, err := p.handle.SnapshotSharedLister().NodeInfos().Get(nodeName) + if err != nil { + return framework.NewStatus(framework.Error, fmt.Sprintf("getting node %q from Snapshot: %v", nodeName, err)) + } + node := nodeInfo.Node() + if node == nil { + return framework.NewStatus(framework.Error, "node not found") + } + // The Pod requires the CPU to be allocated according to CPUBindPolicy, // but the current node does not have a NodeResourceTopology or a valid CPUTopology, // so this error should be exposed to the user @@ -342,20 +382,21 @@ func (p *Plugin) Reserve(ctx context.Context, cycleState *framework.CycleState, } availableCPUs, allocated := getAvailableCPUsFunc(numaInfo) + numaAllocateStrategy := p.getNUMAAllocateStrategy(node) result, err := takeCPUs( numaInfo.cpuTopology, availableCPUs, allocated, state.numCPUsNeeded, - state.resourceSpec.PreferredCPUBindPolicy, - false, - p.pluginArgs.NUMAAllocateStrategy, + state.preferredCPUBindPolicy, + state.resourceSpec.PreferredCPUExclusivePolicy, + numaAllocateStrategy, ) if err != nil { return framework.NewStatus(framework.Error, err.Error()) } - numaInfo.allocateCPUs(pod.UID, result) + numaInfo.allocateCPUs(pod.UID, result, state.preferredCPUExclusivePolicy) state.allocatedCPUs = result return nil } @@ -410,6 +451,19 @@ func (p *Plugin) PreBind(ctx context.Context, cycleState *framework.CycleState, } pod.Annotations[extension.AnnotationResourceStatus] = string(data) + // Write back ResourceSpec annotation if LSR Pod hasn't specified CPUBindPolicy + if state.resourceSpec.PreferredCPUBindPolicy == "" || + state.resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicyDefault { + resourceSpec := &extension.ResourceSpec{ + PreferredCPUBindPolicy: p.pluginArgs.DefaultCPUBindPolicy, + } + data, err = json.Marshal(resourceSpec) + if err != nil { + return framework.NewStatus(framework.Error, err.Error()) + } + pod.Annotations[extension.AnnotationResourceSpec] = string(data) + } + patchBytes, err := generatePodPatch(podOriginal, pod) if err != nil { return framework.NewStatus(framework.Error, err.Error()) diff --git a/pkg/scheduler/plugins/nodenumaresource/plugin_test.go b/pkg/scheduler/plugins/nodenumaresource/plugin_test.go index fd62b1493..069ee42f7 100644 --- a/pkg/scheduler/plugins/nodenumaresource/plugin_test.go +++ b/pkg/scheduler/plugins/nodenumaresource/plugin_test.go @@ -39,8 +39,8 @@ import ( "k8s.io/utils/pointer" "github.com/koordinator-sh/koordinator/apis/extension" - "github.com/koordinator-sh/koordinator/apis/scheduling/config" - "github.com/koordinator-sh/koordinator/apis/scheduling/config/v1beta2" + schedulingconfig "github.com/koordinator-sh/koordinator/apis/scheduling/config" + schedulingconfigv1beta2 "github.com/koordinator-sh/koordinator/apis/scheduling/config/v1beta2" koordinatorclientset "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned" koordfake "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned/fake" koordinatorinformers "github.com/koordinator-sh/koordinator/pkg/client/informers/externalversions" @@ -109,14 +109,14 @@ type pluginTestSuit 
struct { koordinatorSharedInformerFactory koordinatorinformers.SharedInformerFactory nrtSharedInformerFactory nrtinformers.SharedInformerFactory proxyNew runtime.PluginFactory - nodeNUMAResourceArgs *config.NodeNUMAResourceArgs + nodeNUMAResourceArgs *schedulingconfig.NodeNUMAResourceArgs } func newPluginTestSuit(t *testing.T, nodes []*corev1.Node) *pluginTestSuit { - var v1beta2args v1beta2.NodeNUMAResourceArgs - v1beta2.SetDefaults_NodeNUMAResourceArgs(&v1beta2args) - var nodeNUMAResourceArgs config.NodeNUMAResourceArgs - err := v1beta2.Convert_v1beta2_NodeNUMAResourceArgs_To_config_NodeNUMAResourceArgs(&v1beta2args, &nodeNUMAResourceArgs, nil) + var v1beta2args schedulingconfigv1beta2.NodeNUMAResourceArgs + schedulingconfigv1beta2.SetDefaults_NodeNUMAResourceArgs(&v1beta2args) + var nodeNUMAResourceArgs schedulingconfig.NodeNUMAResourceArgs + err := schedulingconfigv1beta2.Convert_v1beta2_NodeNUMAResourceArgs_To_config_NodeNUMAResourceArgs(&v1beta2args, &nodeNUMAResourceArgs, nil) assert.NoError(t, err) nodeNUMAResourcePluginConfig := scheduledconfig.PluginConfig{ @@ -222,9 +222,10 @@ func TestPlugin_PreFilter(t *testing.T) { }, }, wantState: &preFilterState{ - skip: false, - resourceSpec: &extension.ResourceSpec{PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs}, - numCPUsNeeded: 4, + skip: false, + resourceSpec: &extension.ResourceSpec{PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs}, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 4, }, }, { @@ -253,9 +254,39 @@ func TestPlugin_PreFilter(t *testing.T) { }, }, wantState: &preFilterState{ - skip: false, - resourceSpec: &extension.ResourceSpec{PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs}, - numCPUsNeeded: 4, + skip: false, + resourceSpec: &extension.ResourceSpec{PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs}, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 4, + }, + }, + { + name: "cpu set with LSR Prod Pod but not specified CPUBindPolicy", + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + extension.LabelPodQoS: string(extension.QoSLSR), + }, + }, + Spec: corev1.PodSpec{ + Priority: pointer.Int32(extension.PriorityProdValueMax), + Containers: []corev1.Container{ + { + Name: "container-1", + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + }, + }, + }, + }, + }, + }, + wantState: &preFilterState{ + skip: false, + resourceSpec: &extension.ResourceSpec{PreferredCPUBindPolicy: extension.CPUBindPolicyDefault}, + preferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 4, }, }, { @@ -385,11 +416,12 @@ func TestPlugin_PreFilter(t *testing.T) { func TestPlugin_Filter(t *testing.T) { tests := []struct { - name string - state *preFilterState - pod *corev1.Pod - numaInfo *nodeNUMAInfo - want *framework.Status + name string + nodeLabels map[string]string + state *preFilterState + pod *corev1.Pod + numaInfo *nodeNUMAInfo + want *framework.Status }{ { name: "error with missing preFilterState", @@ -430,13 +462,44 @@ func TestPlugin_Filter(t *testing.T) { pod: &corev1.Pod{}, want: nil, }, + { + name: "verify FullPCPUsOnly with SMTAlignmentError", + nodeLabels: map[string]string{ + extension.LabelNodeCPUBindPolicy: extension.NodeCPUBindPolicyFullPCPUsOnly, + }, + state: &preFilterState{ + skip: false, + resourceSpec: &extension.ResourceSpec{}, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + 
numCPUsNeeded: 5, + }, + numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), + pod: &corev1.Pod{}, + want: framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrSMTAlignmentError), + }, + { + name: "verify FullPCPUsOnly with RequiredFullPCPUsPolicy", + nodeLabels: map[string]string{ + extension.LabelNodeCPUBindPolicy: extension.NodeCPUBindPolicyFullPCPUsOnly, + }, + state: &preFilterState{ + skip: false, + resourceSpec: &extension.ResourceSpec{}, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicySpreadByPCPUs, + numCPUsNeeded: 4, + }, + numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), + pod: &corev1.Pod{}, + want: framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrRequiredFullPCPUsPolicy), + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { nodes := []*corev1.Node{ { ObjectMeta: metav1.ObjectMeta{ - Name: "test-node-1", + Name: "test-node-1", + Labels: map[string]string{}, }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ @@ -446,6 +509,10 @@ func TestPlugin_Filter(t *testing.T) { }, }, } + for k, v := range tt.nodeLabels { + nodes[0].Labels[k] = v + } + suit := newPluginTestSuit(t, nodes) p, err := suit.proxyNew(suit.nodeNUMAResourceArgs, suit.Handle) assert.NotNil(t, p) @@ -476,12 +543,13 @@ func TestPlugin_Filter(t *testing.T) { func TestPlugin_Score(t *testing.T) { tests := []struct { - name string - state *preFilterState - pod *corev1.Pod - numaInfo *nodeNUMAInfo - want *framework.Status - wantScore int64 + name string + nodeLabels map[string]string + state *preFilterState + pod *corev1.Pod + numaInfo *nodeNUMAInfo + want *framework.Status + wantScore int64 }{ { name: "error with missing preFilterState", @@ -523,7 +591,8 @@ func TestPlugin_Score(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, }, - numCPUsNeeded: 4, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 4, }, numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), pod: &corev1.Pod{}, @@ -537,7 +606,8 @@ func TestPlugin_Score(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, }, - numCPUsNeeded: 8, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 8, }, numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), pod: &corev1.Pod{}, @@ -551,7 +621,8 @@ func TestPlugin_Score(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicySpreadByPCPUs, }, - numCPUsNeeded: 4, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicySpreadByPCPUs, + numCPUsNeeded: 4, }, numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), pod: &corev1.Pod{}, @@ -565,7 +636,8 @@ func TestPlugin_Score(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, }, - numCPUsNeeded: 16, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 16, }, numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), pod: &corev1.Pod{}, @@ -579,7 +651,8 @@ func TestPlugin_Score(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, }, - numCPUsNeeded: 16, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + numCPUsNeeded: 16, }, numaInfo: newNodeNUMAInfo("test-node-1", 
buildCPUTopologyForTest(2, 2, 4, 2)), pod: &corev1.Pod{}, @@ -593,13 +666,32 @@ func TestPlugin_Score(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicySpreadByPCPUs, }, - numCPUsNeeded: 4, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicySpreadByPCPUs, + numCPUsNeeded: 4, }, numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), pod: &corev1.Pod{}, want: nil, wantScore: 100, }, + { + name: "score with Node NUMA Allocate Strategy", + nodeLabels: map[string]string{ + extension.LabelNodeNUMAAllocateStrategy: extension.NodeNUMAAllocateStrategyLeastAllocated, + }, + state: &preFilterState{ + skip: false, + resourceSpec: &extension.ResourceSpec{ + PreferredCPUBindPolicy: extension.CPUBindPolicySpreadByPCPUs, + }, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicySpreadByPCPUs, + numCPUsNeeded: 2, + }, + numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), + pod: &corev1.Pod{}, + want: nil, + wantScore: 50, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -607,7 +699,8 @@ func TestPlugin_Score(t *testing.T) { nodes := []*corev1.Node{ { ObjectMeta: metav1.ObjectMeta{ - Name: "test-node-1", + Name: "test-node-1", + Labels: map[string]string{}, }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ @@ -617,6 +710,10 @@ func TestPlugin_Score(t *testing.T) { }, }, } + for k, v := range tt.nodeLabels { + nodes[0].Labels[k] = v + } + suit := newPluginTestSuit(t, nodes) p, err := suit.proxyNew(suit.nodeNUMAResourceArgs, suit.Handle) assert.NotNil(t, p) @@ -652,12 +749,14 @@ func TestPlugin_Score(t *testing.T) { func TestPlugin_Reserve(t *testing.T) { tests := []struct { - name string - state *preFilterState - pod *corev1.Pod - numaInfo *nodeNUMAInfo - want *framework.Status - wantCPUSet CPUSet + name string + nodeLabels map[string]string + state *preFilterState + pod *corev1.Pod + numaInfo *nodeNUMAInfo + allocatedCPUs []int + want *framework.Status + wantCPUSet CPUSet }{ { name: "error with missing preFilterState", @@ -697,6 +796,7 @@ func TestPlugin_Reserve(t *testing.T) { resourceSpec: &extension.ResourceSpec{ PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, }, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, }, numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 4, 2)), pod: &corev1.Pod{}, @@ -716,13 +816,52 @@ func TestPlugin_Reserve(t *testing.T) { pod: &corev1.Pod{}, want: framework.NewStatus(framework.Error, "not enough cpus available to satisfy request"), }, + { + name: "succeed with valid cpu topology and node numa least allocate strategy", + nodeLabels: map[string]string{ + extension.LabelNodeNUMAAllocateStrategy: extension.NodeNUMAAllocateStrategyLeastAllocated, + }, + state: &preFilterState{ + skip: false, + numCPUsNeeded: 4, + resourceSpec: &extension.ResourceSpec{ + PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, + }, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + }, + numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 8, 2)), + allocatedCPUs: []int{0, 1, 2, 3}, + pod: &corev1.Pod{}, + want: nil, + wantCPUSet: NewCPUSet(16, 17, 18, 19), + }, + { + name: "succeed with valid cpu topology and node numa most allocate strategy", + nodeLabels: map[string]string{ + extension.LabelNodeNUMAAllocateStrategy: extension.NodeNUMAAllocateStrategyMostAllocated, + }, + state: &preFilterState{ + skip: false, + numCPUsNeeded: 4, + resourceSpec: 
&extension.ResourceSpec{ + PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, + }, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + }, + numaInfo: newNodeNUMAInfo("test-node-1", buildCPUTopologyForTest(2, 1, 8, 2)), + allocatedCPUs: []int{0, 1, 2, 3}, + pod: &corev1.Pod{}, + want: nil, + wantCPUSet: NewCPUSet(4, 5, 6, 7), + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { nodes := []*corev1.Node{ { ObjectMeta: metav1.ObjectMeta{ - Name: "test-node-1", + Name: "test-node-1", + Labels: map[string]string{}, }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ @@ -732,6 +871,10 @@ func TestPlugin_Reserve(t *testing.T) { }, }, } + for k, v := range tt.nodeLabels { + nodes[0].Labels[k] = v + } + suit := newPluginTestSuit(t, nodes) p, err := suit.proxyNew(suit.nodeNUMAResourceArgs, suit.Handle) assert.NotNil(t, p) @@ -739,6 +882,9 @@ func TestPlugin_Reserve(t *testing.T) { plg := p.(*Plugin) if tt.numaInfo != nil { + if len(tt.allocatedCPUs) > 0 { + tt.numaInfo.allocateCPUs(uuid.NewUUID(), NewCPUSet(tt.allocatedCPUs...), schedulingconfig.CPUExclusivePolicyNone) + } plg.nodeInfoCache.nodes[tt.numaInfo.nodeName] = tt.numaInfo } @@ -781,7 +927,7 @@ func TestPlugin_Unreserve(t *testing.T) { }, } - numaInfo.allocateCPUs(pod.UID, state.allocatedCPUs) + numaInfo.allocateCPUs(pod.UID, state.allocatedCPUs, schedulingconfig.CPUExclusivePolicyNone) cycleState := framework.NewCycleState() cycleState.Write(stateKey, state) plg := &Plugin{ @@ -838,3 +984,57 @@ func TestPlugin_PreBind(t *testing.T) { } assert.Equal(t, expectResourceStatus, resourceStatus) } + +func TestPlugin_PreBindWithCPUBindPolicyNone(t *testing.T) { + suit := newPluginTestSuit(t, nil) + p, err := suit.proxyNew(suit.nodeNUMAResourceArgs, suit.Handle) + assert.NotNil(t, p) + assert.Nil(t, err) + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: uuid.NewUUID(), + Namespace: "default", + Name: "test-pod-1", + }, + } + + _, status := suit.Handle.ClientSet().CoreV1().Pods("default").Create(context.TODO(), pod, metav1.CreateOptions{}) + assert.Nil(t, status) + + suit.start() + + plg := p.(*Plugin) + + state := &preFilterState{ + skip: false, + numCPUsNeeded: 4, + resourceSpec: &extension.ResourceSpec{ + PreferredCPUBindPolicy: extension.CPUBindPolicyDefault, + }, + preferredCPUBindPolicy: schedulingconfig.CPUBindPolicyFullPCPUs, + allocatedCPUs: NewCPUSet(0, 1, 2, 3), + } + cycleState := framework.NewCycleState() + cycleState.Write(stateKey, state) + + s := plg.PreBind(context.TODO(), cycleState, pod, "test-node-1") + assert.True(t, s.IsSuccess()) + podModified, status := suit.Handle.ClientSet().CoreV1().Pods("default").Get(context.TODO(), "test-pod-1", metav1.GetOptions{}) + assert.Nil(t, status) + assert.NotNil(t, podModified) + resourceStatus, err := extension.GetResourceStatus(podModified.Annotations) + assert.NoError(t, err) + assert.NotNil(t, resourceStatus) + expectResourceStatus := &extension.ResourceStatus{ + CPUSet: "0-3", + } + assert.Equal(t, expectResourceStatus, resourceStatus) + resourceSpec, err := extension.GetResourceSpec(podModified.Annotations) + assert.NoError(t, err) + assert.NotNil(t, resourceSpec) + expectedResourceSpec := &extension.ResourceSpec{ + PreferredCPUBindPolicy: extension.CPUBindPolicyFullPCPUs, + } + assert.Equal(t, expectedResourceSpec, resourceSpec) +} diff --git a/pkg/slo-controller/config/config.go b/pkg/slo-controller/config/config.go index 99a46decd..c5dea7cf0 100644 --- a/pkg/slo-controller/config/config.go +++ 
b/pkg/slo-controller/config/config.go @@ -68,16 +68,16 @@ type CPUBurstCfg struct { } // +k8s:deepcopy-gen=true -type ResourceQoSCfg struct { - ClusterStrategy *slov1alpha1.ResourceQoSStrategy `json:"clusterStrategy,omitempty"` - NodeStrategies []NodeResourceQoSStrategy `json:"nodeStrategies,omitempty"` +type ResourceQOSCfg struct { + ClusterStrategy *slov1alpha1.ResourceQOSStrategy `json:"clusterStrategy,omitempty"` + NodeStrategies []NodeResourceQOSStrategy `json:"nodeStrategies,omitempty"` } // +k8s:deepcopy-gen=true -type NodeResourceQoSStrategy struct { +type NodeResourceQOSStrategy struct { // an empty label selector matches all objects while a nil label selector matches no objects NodeSelector *metav1.LabelSelector `json:"nodeSelector,omitempty"` - *slov1alpha1.ResourceQoSStrategy + *slov1alpha1.ResourceQOSStrategy } // +k8s:deepcopy-gen=true diff --git a/pkg/slo-controller/config/constants.go b/pkg/slo-controller/config/constants.go index b3b08cd37..2503be22c 100644 --- a/pkg/slo-controller/config/constants.go +++ b/pkg/slo-controller/config/constants.go @@ -24,7 +24,7 @@ const ( // keys in the configmap ColocationConfigKey = "colocation-config" ResourceThresholdConfigKey = "resource-threshold-config" - ResourceQoSConfigKey = "resource-qos-config" + ResourceQOSConfigKey = "resource-qos-config" CPUBurstConfigKey = "cpu-burst-config" ) @@ -33,7 +33,7 @@ Koordinator uses configmap to manage the configuration of SLO, the configmap is /, with the following keys respectively: - - - - + - - et. diff --git a/pkg/slo-controller/config/zz_generated.deepcopy.go b/pkg/slo-controller/config/zz_generated.deepcopy.go index b3105d6b5..3179b9ef1 100644 --- a/pkg/slo-controller/config/zz_generated.deepcopy.go +++ b/pkg/slo-controller/config/zz_generated.deepcopy.go @@ -178,26 +178,26 @@ func (in *NodeColocationCfg) DeepCopy() *NodeColocationCfg { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *NodeResourceQoSStrategy) DeepCopyInto(out *NodeResourceQoSStrategy) { +func (in *NodeResourceQOSStrategy) DeepCopyInto(out *NodeResourceQOSStrategy) { *out = *in if in.NodeSelector != nil { in, out := &in.NodeSelector, &out.NodeSelector *out = new(v1.LabelSelector) (*in).DeepCopyInto(*out) } - if in.ResourceQoSStrategy != nil { - in, out := &in.ResourceQoSStrategy, &out.ResourceQoSStrategy - *out = new(v1alpha1.ResourceQoSStrategy) + if in.ResourceQOSStrategy != nil { + in, out := &in.ResourceQOSStrategy, &out.ResourceQOSStrategy + *out = new(v1alpha1.ResourceQOSStrategy) (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeResourceQoSStrategy. -func (in *NodeResourceQoSStrategy) DeepCopy() *NodeResourceQoSStrategy { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeResourceQOSStrategy. +func (in *NodeResourceQOSStrategy) DeepCopy() *NodeResourceQOSStrategy { if in == nil { return nil } - out := new(NodeResourceQoSStrategy) + out := new(NodeResourceQOSStrategy) in.DeepCopyInto(out) return out } @@ -228,28 +228,28 @@ func (in *NodeResourceThresholdStrategy) DeepCopy() *NodeResourceThresholdStrate } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *ResourceQoSCfg) DeepCopyInto(out *ResourceQoSCfg) { +func (in *ResourceQOSCfg) DeepCopyInto(out *ResourceQOSCfg) { *out = *in if in.ClusterStrategy != nil { in, out := &in.ClusterStrategy, &out.ClusterStrategy - *out = new(v1alpha1.ResourceQoSStrategy) + *out = new(v1alpha1.ResourceQOSStrategy) (*in).DeepCopyInto(*out) } if in.NodeStrategies != nil { in, out := &in.NodeStrategies, &out.NodeStrategies - *out = make([]NodeResourceQoSStrategy, len(*in)) + *out = make([]NodeResourceQOSStrategy, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQoSCfg. -func (in *ResourceQoSCfg) DeepCopy() *ResourceQoSCfg { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQOSCfg. +func (in *ResourceQOSCfg) DeepCopy() *ResourceQOSCfg { if in == nil { return nil } - out := new(ResourceQoSCfg) + out := new(ResourceQOSCfg) in.DeepCopyInto(out) return out } diff --git a/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler.go b/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler.go index a42a21e0f..c085d7fd3 100644 --- a/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler.go +++ b/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler.go @@ -39,7 +39,7 @@ var _ handler.EventHandler = &SLOCfgHandlerForConfigMapEvent{} type SLOCfg struct { ThresholdCfgMerged config.ResourceThresholdCfg `json:"thresholdCfgMerged,omitempty"` - ResourceQoSCfgMerged config.ResourceQoSCfg `json:"resourceQoSCfgMerged,omitempty"` + ResourceQOSCfgMerged config.ResourceQOSCfg `json:"resourceQOSCfgMerged,omitempty"` CPUBurstCfgMerged config.CPUBurstCfg `json:"cpuBurstCfgMerged,omitempty"` } @@ -47,7 +47,7 @@ func (in *SLOCfg) DeepCopy() *SLOCfg { out := &SLOCfg{} out.ThresholdCfgMerged = *in.ThresholdCfgMerged.DeepCopy() out.CPUBurstCfgMerged = *in.CPUBurstCfgMerged.DeepCopy() - out.ResourceQoSCfgMerged = *in.ResourceQoSCfgMerged.DeepCopy() + out.ResourceQOSCfgMerged = *in.ResourceQOSCfgMerged.DeepCopy() return out } @@ -73,7 +73,7 @@ func (c *SLOCfgCache) IsAvailable() bool { func DefaultSLOCfg() SLOCfg { return SLOCfg{ ThresholdCfgMerged: config.ResourceThresholdCfg{ClusterStrategy: util.DefaultResourceThresholdStrategy()}, - ResourceQoSCfgMerged: config.ResourceQoSCfg{ClusterStrategy: &slov1alpha1.ResourceQoSStrategy{}}, + ResourceQOSCfgMerged: config.ResourceQOSCfg{ClusterStrategy: &slov1alpha1.ResourceQOSStrategy{}}, CPUBurstCfgMerged: config.CPUBurstCfg{ClusterStrategy: util.DefaultCPUBurstStrategy()}, } } @@ -118,7 +118,7 @@ func (p *SLOCfgHandlerForConfigMapEvent) syncNodeSLOSpecIfChanged(configMap *cor var newSLOCfg SLOCfg oldSLOCfgCopy := p.SLOCfgCache.sloCfg.DeepCopy() newSLOCfg.ThresholdCfgMerged, _ = caculateResourceThresholdCfgMerged(oldSLOCfgCopy.ThresholdCfgMerged, configMap) - newSLOCfg.ResourceQoSCfgMerged, _ = caculateResourceQoSCfgMerged(oldSLOCfgCopy.ResourceQoSCfgMerged, configMap) + newSLOCfg.ResourceQOSCfgMerged, _ = caculateResourceQOSCfgMerged(oldSLOCfgCopy.ResourceQOSCfgMerged, configMap) newSLOCfg.CPUBurstCfgMerged, _ = caculateCPUBurstCfgMerged(oldSLOCfgCopy.CPUBurstCfgMerged, configMap) return p.updateCacheIfChanged(newSLOCfg) diff --git a/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler_test.go b/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler_test.go index ec127f2e1..3ef4b84d4 100644 --- a/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler_test.go +++ b/pkg/slo-controller/nodeslo/nodeslo_cm_event_handler_test.go @@ -43,11 +43,11 @@ func 
Test_syncNodeSLOSpecIfChanged(t *testing.T) { }, Data: map[string]string{ config.ResourceThresholdConfigKey: "{\"clusterStrategy\":{\"enable\":true,\"cpuSuppressThresholdPercent\":60}}", - config.ResourceQoSConfigKey: ` + config.ResourceQOSConfigKey: ` { "clusterStrategy": { - "be": { - "cpuQoS": { + "beClass": { + "cpuQOS": { "groupIdentity": 0 } } @@ -62,10 +62,10 @@ func Test_syncNodeSLOSpecIfChanged(t *testing.T) { expectTestingCfg1.ThresholdCfgMerged.ClusterStrategy.Enable = pointer.BoolPtr(true) expectTestingCfg1.ThresholdCfgMerged.ClusterStrategy.CPUSuppressThresholdPercent = pointer.Int64Ptr(60) - expectTestingCfg1.ResourceQoSCfgMerged.ClusterStrategy = &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + expectTestingCfg1.ResourceQOSCfgMerged.ClusterStrategy = &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, @@ -131,7 +131,7 @@ func Test_syncNodeSLOSpecIfChanged(t *testing.T) { }, Data: map[string]string{ config.ResourceThresholdConfigKey: "invalid_content", - config.ResourceQoSConfigKey: "invalid_content", + config.ResourceQOSConfigKey: "invalid_content", config.CPUBurstConfigKey: "invalid_content", }, }}, diff --git a/pkg/slo-controller/nodeslo/nodeslo_controller.go b/pkg/slo-controller/nodeslo/nodeslo_controller.go index 3bd7cbedb..19a2ee9bb 100644 --- a/pkg/slo-controller/nodeslo/nodeslo_controller.go +++ b/pkg/slo-controller/nodeslo/nodeslo_controller.go @@ -74,7 +74,7 @@ func (r *NodeSLOReconciler) getNodeSLOSpec(node *corev1.Node, oldSpec *slov1alph } // resourceQOS spec - nodeSLOSpec.ResourceQoSStrategy, err = getResourceQoSSpec(node, &sloCfg.ResourceQoSCfgMerged) + nodeSLOSpec.ResourceQOSStrategy, err = getResourceQOSSpec(node, &sloCfg.ResourceQOSCfgMerged) if err != nil { klog.Warningf("getNodeSLOSpec(): failed to get resourceQoS spec for node %s,error: %v", node.Name, err) } diff --git a/pkg/slo-controller/nodeslo/nodeslo_controller_test.go b/pkg/slo-controller/nodeslo/nodeslo_controller_test.go index 2c9f94b82..9d683c3fe 100644 --- a/pkg/slo-controller/nodeslo/nodeslo_controller_test.go +++ b/pkg/slo-controller/nodeslo/nodeslo_controller_test.go @@ -40,19 +40,19 @@ import ( func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { testingResourceThresholdStrategy := util.DefaultResourceThresholdStrategy() testingResourceThresholdStrategy.CPUSuppressThresholdPercent = pointer.Int64Ptr(60) - testingResourceQoSStrategyOld := &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingResourceQOSStrategyOld := &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, }, } - testingResourceQoSStrategy := &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingResourceQOSStrategy := &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, @@ -81,7 +81,7 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { fields: fields{}, want: &slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: util.DefaultResourceThresholdStrategy(), - ResourceQoSStrategy: 
&slov1alpha1.ResourceQoSStrategy{}, + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{}, CPUBurstStrategy: util.DefaultCPUBurstStrategy(), }, wantErr: false, @@ -103,12 +103,12 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { }, Data: map[string]string{ config.ResourceThresholdConfigKey: "{\"clusterStrategy\":{\"invalidField\",\"cpuSuppressThresholdPercent\":60}}", - config.ResourceQoSConfigKey: "{\"clusterStrategy\":{\"invalidField\"}}", + config.ResourceQOSConfigKey: "{\"clusterStrategy\":{\"invalidField\"}}", }, }}, want: &slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: util.DefaultResourceThresholdStrategy(), - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{}, + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{}, CPUBurstStrategy: util.DefaultCPUBurstStrategy(), }, wantErr: false, @@ -134,7 +134,7 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { }}, want: &slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: testingResourceThresholdStrategy, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{}, + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{}, CPUBurstStrategy: util.DefaultCPUBurstStrategy(), }, wantErr: false, @@ -156,11 +156,11 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { }, Data: map[string]string{ config.ResourceThresholdConfigKey: "{\"clusterStrategy\":{\"enable\":false,\"cpuSuppressThresholdPercent\":60}}", - config.ResourceQoSConfigKey: ` + config.ResourceQOSConfigKey: ` { "clusterStrategy": { - "be": { - "cpuQoS": { + "beClass": { + "cpuQOS": { "groupIdentity": 0 } } @@ -171,7 +171,7 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { }}, want: &slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: testingResourceThresholdStrategy, - ResourceQoSStrategy: testingResourceQoSStrategy, + ResourceQOSStrategy: testingResourceQOSStrategy, CPUBurstStrategy: util.DefaultCPUBurstStrategy(), }, wantErr: false, @@ -193,11 +193,11 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { }, Data: map[string]string{ config.ResourceThresholdConfigKey: "{\"clusterStrategy\":{\"enable\":false,\"cpuSuppressThresholdPercent\":60}}", - config.ResourceQoSConfigKey: ` + config.ResourceQOSConfigKey: ` { "clusterStrategy": { - "be": { - "cpuQoS": { + "beClass": { + "cpuQOS": { "groupIdentity": 0 } } @@ -208,7 +208,7 @@ func TestNodeSLOReconciler_initNodeSLO(t *testing.T) { }}, want: &slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: testingResourceThresholdStrategy, - ResourceQoSStrategy: testingResourceQoSStrategyOld, + ResourceQOSStrategy: testingResourceQOSStrategyOld, CPUBurstStrategy: util.DefaultCPUBurstStrategy(), }, wantErr: false, @@ -263,11 +263,11 @@ func TestNodeSLOReconciler_Reconcile(t *testing.T) { }, Data: map[string]string{ config.ResourceThresholdConfigKey: "{\"clusterStrategy\":{\"enable\":true,\"cpuSuppressThresholdPercent\":60}}", - config.ResourceQoSConfigKey: ` + config.ResourceQOSConfigKey: ` { "clusterStrategy": { - "be": { - "cpuQoS": { + "beClass": { + "cpuQOS": { "groupIdentity": 0 } } @@ -280,10 +280,10 @@ func TestNodeSLOReconciler_Reconcile(t *testing.T) { testingResourceThresholdStrategy := util.DefaultResourceThresholdStrategy() testingResourceThresholdStrategy.Enable = pointer.BoolPtr(true) testingResourceThresholdStrategy.CPUSuppressThresholdPercent = pointer.Int64Ptr(60) - testingResourceQoSStrategy := &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingResourceQOSStrategy := 
&slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, @@ -295,7 +295,7 @@ func TestNodeSLOReconciler_Reconcile(t *testing.T) { nodeSLOSpec := &slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: testingResourceThresholdStrategy, - ResourceQoSStrategy: testingResourceQoSStrategy, + ResourceQOSStrategy: testingResourceQOSStrategy, CPUBurstStrategy: testingCPUBurstStrategy, } nodeReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: testingNode.Name}} diff --git a/pkg/slo-controller/nodeslo/resource_strategy.go b/pkg/slo-controller/nodeslo/resource_strategy.go index c227d46d0..af3fecb0a 100644 --- a/pkg/slo-controller/nodeslo/resource_strategy.go +++ b/pkg/slo-controller/nodeslo/resource_strategy.go @@ -46,7 +46,7 @@ func getResourceThresholdSpec(node *corev1.Node, cfg *config.ResourceThresholdCf return cfg.ClusterStrategy.DeepCopy(), nil } -func getResourceQoSSpec(node *corev1.Node, cfg *config.ResourceQoSCfg) (*slov1alpha1.ResourceQoSStrategy, error) { +func getResourceQOSSpec(node *corev1.Node, cfg *config.ResourceQOSCfg) (*slov1alpha1.ResourceQOSStrategy, error) { nodeLabels := labels.Set(node.Labels) for _, nodeStrategy := range cfg.NodeStrategies { selector, err := metav1.LabelSelectorAsSelector(nodeStrategy.NodeSelector) @@ -55,7 +55,7 @@ func getResourceQoSSpec(node *corev1.Node, cfg *config.ResourceQoSCfg) (*slov1al continue } if selector.Matches(nodeLabels) { - return nodeStrategy.ResourceQoSStrategy.DeepCopy(), nil + return nodeStrategy.ResourceQOSStrategy.DeepCopy(), nil } } @@ -114,37 +114,37 @@ func caculateResourceThresholdCfgMerged(oldCfg config.ResourceThresholdCfg, conf return mergedCfg, nil } -func caculateResourceQoSCfgMerged(oldCfg config.ResourceQoSCfg, configMap *corev1.ConfigMap) (config.ResourceQoSCfg, error) { - cfgStr, ok := configMap.Data[config.ResourceQoSConfigKey] +func caculateResourceQOSCfgMerged(oldCfg config.ResourceQOSCfg, configMap *corev1.ConfigMap) (config.ResourceQOSCfg, error) { + cfgStr, ok := configMap.Data[config.ResourceQOSConfigKey] if !ok { - return DefaultSLOCfg().ResourceQoSCfgMerged, nil + return DefaultSLOCfg().ResourceQOSCfgMerged, nil } - mergedCfg := DefaultSLOCfg().ResourceQoSCfgMerged + mergedCfg := DefaultSLOCfg().ResourceQOSCfgMerged if err := json.Unmarshal([]byte(cfgStr), &mergedCfg); err != nil { - klog.Errorf("failed to unmarshal config %s, err: %s", config.ResourceQoSConfigKey, err) + klog.Errorf("failed to unmarshal config %s, err: %s", config.ResourceQOSConfigKey, err) return oldCfg, err } // merge ClusterStrategy - clusterMerged := DefaultSLOCfg().ResourceQoSCfgMerged.ClusterStrategy.DeepCopy() + clusterMerged := DefaultSLOCfg().ResourceQOSCfgMerged.ClusterStrategy.DeepCopy() if mergedCfg.ClusterStrategy != nil { mergedStrategyInterface, _ := util.MergeCfg(clusterMerged, mergedCfg.ClusterStrategy) - clusterMerged = mergedStrategyInterface.(*slov1alpha1.ResourceQoSStrategy) + clusterMerged = mergedStrategyInterface.(*slov1alpha1.ResourceQOSStrategy) } mergedCfg.ClusterStrategy = clusterMerged for index, nodeStrategy := range mergedCfg.NodeStrategies { // merge with clusterStrategy - var mergedNodeStrategy *slov1alpha1.ResourceQoSStrategy + var mergedNodeStrategy *slov1alpha1.ResourceQOSStrategy clusterCfgCopy := mergedCfg.ClusterStrategy.DeepCopy() - if nodeStrategy.ResourceQoSStrategy != nil { - mergedStrategyInterface, _ := util.MergeCfg(clusterCfgCopy, nodeStrategy.ResourceQoSStrategy) - 
mergedNodeStrategy = mergedStrategyInterface.(*slov1alpha1.ResourceQoSStrategy) + if nodeStrategy.ResourceQOSStrategy != nil { + mergedStrategyInterface, _ := util.MergeCfg(clusterCfgCopy, nodeStrategy.ResourceQOSStrategy) + mergedNodeStrategy = mergedStrategyInterface.(*slov1alpha1.ResourceQOSStrategy) } else { mergedNodeStrategy = clusterCfgCopy } - mergedCfg.NodeStrategies[index].ResourceQoSStrategy = mergedNodeStrategy + mergedCfg.NodeStrategies[index].ResourceQOSStrategy = mergedNodeStrategy } diff --git a/pkg/slo-controller/nodeslo/resource_strategy_test.go b/pkg/slo-controller/nodeslo/resource_strategy_test.go index 6a7f60109..7ca9c70dc 100644 --- a/pkg/slo-controller/nodeslo/resource_strategy_test.go +++ b/pkg/slo-controller/nodeslo/resource_strategy_test.go @@ -243,40 +243,40 @@ func Test_caculateResourceThresholdCfgMerged(t *testing.T) { } } -func Test_getResourceQoSSpec(t *testing.T) { +func Test_getResourceQOSSpec(t *testing.T) { defaultSLOCfg := DefaultSLOCfg() - testingResourceQoSCfg := &config.ResourceQoSCfg{ - ClusterStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingResourceQOSCfg := &config.ResourceQOSCfg{ + ClusterStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, }, }, } - testingResourceQoSCfg1 := &config.ResourceQoSCfg{ - ClusterStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingResourceQOSCfg1 := &config.ResourceQOSCfg{ + ClusterStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, }, }, - NodeStrategies: []config.NodeResourceQoSStrategy{ + NodeStrategies: []config.NodeResourceQOSStrategy{ { NodeSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "xxx": "yyy", }, }, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(1), }, }, @@ -289,10 +289,10 @@ func Test_getResourceQoSSpec(t *testing.T) { "zzz": "zzz", }, }, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(2), }, }, @@ -303,21 +303,21 @@ func Test_getResourceQoSSpec(t *testing.T) { } type args struct { node *corev1.Node - cfg *config.ResourceQoSCfg + cfg *config.ResourceQOSCfg } tests := []struct { name string args args - want *slov1alpha1.ResourceQoSStrategy + want *slov1alpha1.ResourceQOSStrategy wantErr bool }{ { name: "node empty, use cluster config", args: args{ node: &corev1.Node{}, - cfg: &defaultSLOCfg.ResourceQoSCfgMerged, + cfg: &defaultSLOCfg.ResourceQOSCfgMerged, }, - want: &slov1alpha1.ResourceQoSStrategy{}, + want: &slov1alpha1.ResourceQOSStrategy{}, wantErr: false, }, { @@ -328,9 +328,9 @@ func Test_getResourceQoSSpec(t *testing.T) { Name: "test-node", }, 
}, - cfg: testingResourceQoSCfg, + cfg: testingResourceQOSCfg, }, - want: testingResourceQoSCfg.ClusterStrategy, + want: testingResourceQOSCfg.ClusterStrategy, }, { name: "get node config correctly", @@ -343,9 +343,9 @@ func Test_getResourceQoSSpec(t *testing.T) { }, }, }, - cfg: testingResourceQoSCfg1, + cfg: testingResourceQOSCfg1, }, - want: testingResourceQoSCfg1.NodeStrategies[1].ResourceQoSStrategy, + want: testingResourceQOSCfg1.NodeStrategies[1].ResourceQOSStrategy, }, { name: "get firstly-matched node config", @@ -358,32 +358,32 @@ func Test_getResourceQoSSpec(t *testing.T) { }, }, }, - cfg: testingResourceQoSCfg1, + cfg: testingResourceQOSCfg1, }, - want: testingResourceQoSCfg1.NodeStrategies[0].ResourceQoSStrategy, + want: testingResourceQOSCfg1.NodeStrategies[0].ResourceQOSStrategy, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, gotErr := getResourceQoSSpec(tt.args.node, tt.args.cfg) + got, gotErr := getResourceQOSSpec(tt.args.node, tt.args.cfg) assert.Equal(t, tt.wantErr, gotErr != nil) assert.Equal(t, tt.want, got) }) } } -func Test_caculateResourceQoSCfgMerged(t *testing.T) { - defaultSLOCfg := DefaultSLOCfg().ResourceQoSCfgMerged - oldSLOConfig := &config.ResourceQoSCfg{ - ClusterStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ +func Test_caculateResourceQOSCfgMerged(t *testing.T) { + defaultSLOCfg := DefaultSLOCfg().ResourceQOSCfgMerged + oldSLOConfig := &config.ResourceQOSCfg{ + ClusterStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(2), }, }, - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ - MemoryQoS: slov1alpha1.MemoryQoS{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ + MemoryQOS: slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(40), }, }, @@ -391,11 +391,11 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { }, } - testingOnlyCluster := &config.ResourceQoSCfg{ - ClusterStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingOnlyCluster := &config.ResourceQOSCfg{ + ClusterStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, @@ -405,27 +405,27 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { testingOnlyClusterStr, _ := json.Marshal(testingOnlyCluster) expectTestingOnlyCluster := testingOnlyCluster.DeepCopy() - testingResourceQoSCfg1 := &config.ResourceQoSCfg{ - ClusterStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + testingResourceQOSCfg1 := &config.ResourceQOSCfg{ + ClusterStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, }, }, - NodeStrategies: []config.NodeResourceQoSStrategy{ + NodeStrategies: []config.NodeResourceQOSStrategy{ { NodeSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "xxx": "yyy", }, }, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + 
CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(0), }, }, @@ -438,10 +438,10 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { "zzz": "zzz", }, }, - ResourceQoSStrategy: &slov1alpha1.ResourceQoSStrategy{ - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ - CPUQoS: slov1alpha1.CPUQoS{ + ResourceQOSStrategy: &slov1alpha1.ResourceQOSStrategy{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ + CPUQOS: slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(-1), }, }, @@ -450,10 +450,10 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { }, }, } - testingResourceQoSCfgStr1, _ := json.Marshal(testingResourceQoSCfg1) - expectTestingResourceQoSCfg1 := testingResourceQoSCfg1.DeepCopy() - expectTestingResourceQoSCfg1.NodeStrategies[0].BE.CPUQoS.GroupIdentity = pointer.Int64Ptr(0) - expectTestingResourceQoSCfg1.NodeStrategies[1].BE.CPUQoS.GroupIdentity = pointer.Int64Ptr(-1) + testingResourceQOSCfgStr1, _ := json.Marshal(testingResourceQOSCfg1) + expectTestingResourceQOSCfg1 := testingResourceQOSCfg1.DeepCopy() + expectTestingResourceQOSCfg1.NodeStrategies[0].BEClass.CPUQOS.GroupIdentity = pointer.Int64Ptr(0) + expectTestingResourceQOSCfg1.NodeStrategies[1].BEClass.CPUQOS.GroupIdentity = pointer.Int64Ptr(-1) type args struct { configMap *corev1.ConfigMap @@ -461,7 +461,7 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { tests := []struct { name string args args - want *config.ResourceQoSCfg + want *config.ResourceQOSCfg wantErr bool }{ { @@ -477,7 +477,7 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { args: args{ configMap: &corev1.ConfigMap{ Data: map[string]string{ - config.ResourceQoSConfigKey: "invalid_content", + config.ResourceQOSConfigKey: "invalid_content", }, }, }, @@ -493,7 +493,7 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { Namespace: config.ConfigNameSpace, }, Data: map[string]string{ - config.ResourceQoSConfigKey: string(testingOnlyClusterStr), + config.ResourceQOSConfigKey: string(testingOnlyClusterStr), }, }, }, @@ -508,16 +508,16 @@ func Test_caculateResourceQoSCfgMerged(t *testing.T) { Namespace: config.ConfigNameSpace, }, Data: map[string]string{ - config.ResourceQoSConfigKey: string(testingResourceQoSCfgStr1), + config.ResourceQOSConfigKey: string(testingResourceQOSCfgStr1), }, }, }, - want: expectTestingResourceQoSCfg1, + want: expectTestingResourceQOSCfg1, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, gotErr := caculateResourceQoSCfgMerged(*oldSLOConfig, tt.args.configMap) + got, gotErr := caculateResourceQOSCfgMerged(*oldSLOConfig, tt.args.configMap) assert.Equal(t, tt.wantErr, gotErr != nil) assert.Equal(t, tt.want, &got) }) diff --git a/pkg/util/config.go b/pkg/util/config.go index 8247bb3c3..7a5e8797d 100644 --- a/pkg/util/config.go +++ b/pkg/util/config.go @@ -28,7 +28,7 @@ import ( func DefaultNodeSLOSpecConfig() slov1alpha1.NodeSLOSpec { return slov1alpha1.NodeSLOSpec{ ResourceUsedThresholdWithBE: DefaultResourceThresholdStrategy(), - ResourceQoSStrategy: DefaultResourceQoSStrategy(), + ResourceQOSStrategy: DefaultResourceQOSStrategy(), CPUBurstStrategy: DefaultCPUBurstStrategy(), } } @@ -42,45 +42,45 @@ func DefaultResourceThresholdStrategy() *slov1alpha1.ResourceThresholdStrategy { } } -func DefaultCPUQoS(qos apiext.QoSClass) *slov1alpha1.CPUQoS { - var cpuQoS *slov1alpha1.CPUQoS +func DefaultCPUQOS(qos apiext.QoSClass) *slov1alpha1.CPUQOS { + var cpuQOS *slov1alpha1.CPUQOS switch qos { case apiext.QoSLSR: - cpuQoS = 
&slov1alpha1.CPUQoS{ + cpuQOS = &slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(2), } case apiext.QoSLS: - cpuQoS = &slov1alpha1.CPUQoS{ + cpuQOS = &slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(2), } case apiext.QoSBE: - cpuQoS = &slov1alpha1.CPUQoS{ + cpuQOS = &slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64Ptr(-1), } default: klog.Infof("cpu qos has no auto config for qos %s", qos) } - return cpuQoS + return cpuQOS } // TODO https://github.com/koordinator-sh/koordinator/pull/94#discussion_r858786733 -func DefaultResctrlQoS(qos apiext.QoSClass) *slov1alpha1.ResctrlQoS { - var resctrlQoS *slov1alpha1.ResctrlQoS +func DefaultResctrlQOS(qos apiext.QoSClass) *slov1alpha1.ResctrlQOS { + var resctrlQOS *slov1alpha1.ResctrlQOS switch qos { case apiext.QoSLSR: - resctrlQoS = &slov1alpha1.ResctrlQoS{ + resctrlQOS = &slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), MBAPercent: pointer.Int64Ptr(100), } case apiext.QoSLS: - resctrlQoS = &slov1alpha1.ResctrlQoS{ + resctrlQOS = &slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), MBAPercent: pointer.Int64Ptr(100), } case apiext.QoSBE: - resctrlQoS = &slov1alpha1.ResctrlQoS{ + resctrlQOS = &slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(30), MBAPercent: pointer.Int64Ptr(100), @@ -88,10 +88,10 @@ func DefaultResctrlQoS(qos apiext.QoSClass) *slov1alpha1.ResctrlQoS { default: klog.Infof("resctrl qos has no auto config for qos %s", qos) } - return resctrlQoS + return resctrlQOS } -// DefaultMemoryQoS returns the recommended configuration for memory qos strategy. +// DefaultMemoryQOS returns the recommended configuration for memory qos strategy. // Please refer to `apis/slo/v1alpha1` for the definition of each field. // In the recommended configuration, all abilities of memcg qos are disable, including `MinLimitPercent`, // `LowLimitPercent`, `ThrottlingPercent` since they are not fully beneficial to all scenarios. Whereas, they are still @@ -101,11 +101,11 @@ func DefaultResctrlQoS(qos apiext.QoSClass) *slov1alpha1.ResctrlQoS { // the more excess reclamations. // Memory min watermark grading corresponding to `WmarkMinAdj` is enabled. It benefits high-priority pods by postponing // global reclaim when machine's free memory is below than `/proc/sys/vm/min_free_kbytes`. 
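// (Editor's illustrative note, not part of the patch.) The doc comment above describes the
// recommended memory QoS defaults returned by the renamed DefaultMemoryQOS and
// DefaultResourceQOSStrategy helpers. Assuming only the types and constructors visible in
// this diff, a caller could read the per-class defaults roughly like this (the variable
// names are hypothetical):
//
//	strategy := DefaultResourceQOSStrategy()   // LSRClass, LSClass and BEClass populated
//	beMemory := strategy.BEClass.MemoryQOS     // *MemoryQOSCfg, Enable defaults to false
//	_ = beMemory.MemoryQOS.MinLimitPercent     // recommended default is 0 per this diff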
-func DefaultMemoryQoS(qos apiext.QoSClass) *slov1alpha1.MemoryQoS { - var memoryQoS *slov1alpha1.MemoryQoS +func DefaultMemoryQOS(qos apiext.QoSClass) *slov1alpha1.MemoryQOS { + var memoryQOS *slov1alpha1.MemoryQOS switch qos { case apiext.QoSLSR: - memoryQoS = &slov1alpha1.MemoryQoS{ + memoryQOS = &slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -117,7 +117,7 @@ func DefaultMemoryQoS(qos apiext.QoSClass) *slov1alpha1.MemoryQoS { OomKillGroup: pointer.Int64Ptr(0), } case apiext.QoSLS: - memoryQoS = &slov1alpha1.MemoryQoS{ + memoryQOS = &slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -129,7 +129,7 @@ func DefaultMemoryQoS(qos apiext.QoSClass) *slov1alpha1.MemoryQoS { OomKillGroup: pointer.Int64Ptr(0), } case apiext.QoSBE: - memoryQoS = &slov1alpha1.MemoryQoS{ + memoryQOS = &slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -143,90 +143,90 @@ func DefaultMemoryQoS(qos apiext.QoSClass) *slov1alpha1.MemoryQoS { default: klog.V(5).Infof("memory qos has no auto config for qos %s", qos) } - return memoryQoS + return memoryQOS } -func DefaultResourceQoSStrategy() *slov1alpha1.ResourceQoSStrategy { - return &slov1alpha1.ResourceQoSStrategy{ - LSR: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ +func DefaultResourceQOSStrategy() *slov1alpha1.ResourceQOSStrategy { + return &slov1alpha1.ResourceQOSStrategy{ + LSRClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.BoolPtr(false), - CPUQoS: *DefaultCPUQoS(apiext.QoSLSR), + CPUQOS: *DefaultCPUQOS(apiext.QoSLSR), }, - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(false), - ResctrlQoS: *DefaultResctrlQoS(apiext.QoSLSR), + ResctrlQOS: *DefaultResctrlQOS(apiext.QoSLSR), }, - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(false), - MemoryQoS: *DefaultMemoryQoS(apiext.QoSLSR), + MemoryQOS: *DefaultMemoryQOS(apiext.QoSLSR), }, }, - LS: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + LSClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.BoolPtr(false), - CPUQoS: *DefaultCPUQoS(apiext.QoSLS), + CPUQOS: *DefaultCPUQOS(apiext.QoSLS), }, - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(false), - ResctrlQoS: *DefaultResctrlQoS(apiext.QoSLS), + ResctrlQOS: *DefaultResctrlQOS(apiext.QoSLS), }, - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(false), - MemoryQoS: *DefaultMemoryQoS(apiext.QoSLS), + MemoryQOS: *DefaultMemoryQOS(apiext.QoSLS), }, }, - BE: &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ + BEClass: &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.BoolPtr(false), - CPUQoS: *DefaultCPUQoS(apiext.QoSBE), + CPUQOS: *DefaultCPUQOS(apiext.QoSBE), }, - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(false), - ResctrlQoS: *DefaultResctrlQoS(apiext.QoSBE), + ResctrlQOS: *DefaultResctrlQOS(apiext.QoSBE), }, - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(false), - MemoryQoS: *DefaultMemoryQoS(apiext.QoSBE), + MemoryQOS: 
*DefaultMemoryQOS(apiext.QoSBE), }, }, } } -func NoneResourceQoS(qos apiext.QoSClass) *slov1alpha1.ResourceQoS { - return &slov1alpha1.ResourceQoS{ - CPUQoS: &slov1alpha1.CPUQoSCfg{ +func NoneResourceQOS(qos apiext.QoSClass) *slov1alpha1.ResourceQOS { + return &slov1alpha1.ResourceQOS{ + CPUQOS: &slov1alpha1.CPUQOSCfg{ Enable: pointer.BoolPtr(false), - CPUQoS: *NoneCPUQoS(), + CPUQOS: *NoneCPUQOS(), }, - ResctrlQoS: &slov1alpha1.ResctrlQoSCfg{ + ResctrlQOS: &slov1alpha1.ResctrlQOSCfg{ Enable: pointer.BoolPtr(false), - ResctrlQoS: *NoneResctrlQoS(), + ResctrlQOS: *NoneResctrlQOS(), }, - MemoryQoS: &slov1alpha1.MemoryQoSCfg{ + MemoryQOS: &slov1alpha1.MemoryQOSCfg{ Enable: pointer.BoolPtr(false), - MemoryQoS: *NoneMemoryQoS(), + MemoryQOS: *NoneMemoryQOS(), }, } } -func NoneCPUQoS() *slov1alpha1.CPUQoS { - return &slov1alpha1.CPUQoS{ +func NoneCPUQOS() *slov1alpha1.CPUQOS { + return &slov1alpha1.CPUQOS{ GroupIdentity: pointer.Int64(0), } } -func NoneResctrlQoS() *slov1alpha1.ResctrlQoS { - return &slov1alpha1.ResctrlQoS{ +func NoneResctrlQOS() *slov1alpha1.ResctrlQOS { + return &slov1alpha1.ResctrlQOS{ CATRangeStartPercent: pointer.Int64Ptr(0), CATRangeEndPercent: pointer.Int64Ptr(100), MBAPercent: pointer.Int64Ptr(100), } } -// NoneMemoryQoS returns the all-disabled configuration for memory qos strategy. -func NoneMemoryQoS() *slov1alpha1.MemoryQoS { - return &slov1alpha1.MemoryQoS{ +// NoneMemoryQOS returns the all-disabled configuration for memory qos strategy. +func NoneMemoryQOS() *slov1alpha1.MemoryQOS { + return &slov1alpha1.MemoryQOS{ MinLimitPercent: pointer.Int64Ptr(0), LowLimitPercent: pointer.Int64Ptr(0), ThrottlingPercent: pointer.Int64Ptr(0), @@ -239,12 +239,12 @@ func NoneMemoryQoS() *slov1alpha1.MemoryQoS { } } -// NoneResourceQoSStrategy indicates the qos strategy with all qos -func NoneResourceQoSStrategy() *slov1alpha1.ResourceQoSStrategy { - return &slov1alpha1.ResourceQoSStrategy{ - LSR: NoneResourceQoS(apiext.QoSLSR), - LS: NoneResourceQoS(apiext.QoSLS), - BE: NoneResourceQoS(apiext.QoSBE), +// NoneResourceQOSStrategy indicates the qos strategy with all qos +func NoneResourceQOSStrategy() *slov1alpha1.ResourceQOSStrategy { + return &slov1alpha1.ResourceQOSStrategy{ + LSRClass: NoneResourceQOS(apiext.QoSLSR), + LSClass: NoneResourceQOS(apiext.QoSLS), + BEClass: NoneResourceQOS(apiext.QoSBE), } } diff --git a/pkg/util/container.go b/pkg/util/container.go index deb1d4063..cc7a86a1d 100644 --- a/pkg/util/container.go +++ b/pkg/util/container.go @@ -48,6 +48,16 @@ func GetContainerCgroupPathWithKubeByID(podParentDir string, containerID string) ), nil } +// @parentDir kubepods-burstable.slice/kubepods-pod7712555c_ce62_454a_9e18_9ff0217b8941.slice/ +// @return /sys/fs/cgroup/cpu/kubepods.slice/kubepods-pod7712555c_ce62_454a_9e18_9ff0217b8941.slice/cgroup.procs +func GetContainerCgroupCPUProcsPath(podParentDir string, c *corev1.ContainerStatus) (string, error) { + containerPath, err := GetContainerCgroupPathWithKube(podParentDir, c) + if err != nil { + return "", err + } + return system.GetCgroupFilePath(containerPath, system.CPUProcs), nil +} + func GetContainerCgroupCPUAcctUsagePath(podParentDir string, c *corev1.ContainerStatus) (string, error) { containerPath, err := GetContainerCgroupPathWithKube(podParentDir, c) if err != nil { @@ -219,6 +229,40 @@ func GetContainerCurTasks(podParentDir string, c *corev1.ContainerStatus) ([]int return system.GetCgroupCurTasks(cgroupPath) } +func GetPIDsInPod(podParentDir string, cs []corev1.ContainerStatus) ([]uint64, error) { + pids := 
make([]uint64, 0) + for i := range cs { + p, err := GetPIDsInContainer(podParentDir, &cs[i]) + if err != nil { + return nil, err + } + pids = append(pids, p...) + } + return pids, nil +} + +func GetPIDsInContainer(podParentDir string, c *corev1.ContainerStatus) ([]uint64, error) { + cgroupPath, err := GetContainerCgroupCPUProcsPath(podParentDir, c) + if err != nil { + return nil, err + } + rawContent, err := ioutil.ReadFile(cgroupPath) + if err != nil { + return nil, err + } + pidStrs := strings.Fields(strings.TrimSpace(string(rawContent))) + pids := make([]uint64, len(pidStrs)) + + for i := 0; i < len(pids); i++ { + p, err := strconv.ParseUint(pidStrs[i], 10, 64) + if err != nil { + return nil, err + } + pids[i] = p + } + return pids, nil +} + func FindContainerIdAndStatusByName(status *corev1.PodStatus, name string) (string, *corev1.ContainerStatus, error) { allStatuses := status.InitContainerStatuses allStatuses = append(allStatuses, status.ContainerStatuses...) diff --git a/pkg/util/container_test.go b/pkg/util/container_test.go index 0dd31d7e0..c12b2817f 100644 --- a/pkg/util/container_test.go +++ b/pkg/util/container_test.go @@ -20,15 +20,16 @@ import ( "fmt" "io/ioutil" "os" + "path" "path/filepath" + "reflect" "testing" + "github.com/koordinator-sh/koordinator/apis/extension" + "github.com/koordinator-sh/koordinator/pkg/util/system" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - - "github.com/koordinator-sh/koordinator/apis/extension" - "github.com/koordinator-sh/koordinator/pkg/util/system" ) func Test_getContainerCgroupPathWithSystemdDriver(t *testing.T) { @@ -547,6 +548,26 @@ func Test_GetContainerCgroupXXXPath(t *testing.T) { expectPath: "", expectErr: true, }, + { + name: "test_cpu_procs_path", + fn: GetContainerCgroupCPUProcsPath, + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + containerStatus: &corev1.ContainerStatus{ + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + expectPath: "/host-cgroup/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf.scope/cgroup.procs", + expectErr: false, + }, + { + name: "test_cpu_procs_path_invalid", + fn: GetContainerCgroupCPUProcsPath, + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + containerStatus: &corev1.ContainerStatus{ + ContainerID: "703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + expectPath: "", + expectErr: true, + }, } for _, tt := range tests { @@ -691,3 +712,127 @@ func Test_GetContainerXXXValue(t *testing.T) { }) } } + +func TestGetPIDsInContainer(t *testing.T) { + system.SetupCgroupPathFormatter(system.Systemd) + defer system.SetupCgroupPathFormatter(system.Systemd) + type args struct { + podParentDir string + c *corev1.ContainerStatus + } + dir := t.TempDir() + system.Conf.CgroupRootDir = dir + + podCgroupPath := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf.scope/cgroup.procs" + sysCgroupPath := path.Join(dir, podCgroupPath) + if err := writeCgroupContent(sysCgroupPath, []byte("12\n23")); err != nil { + t.Fatal(err) + } + + tests := []struct { + name string + args args + want []uint64 + wantErr bool + 
}{ + { + name: "cgroup", + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + c: &corev1.ContainerStatus{ + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + }, + want: []uint64{12, 23}, + }, + { + name: "not exist", + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + c: &corev1.ContainerStatus{ + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4assf", + }, + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := GetPIDsInContainer(tt.args.podParentDir, tt.args.c) + if (err != nil) != tt.wantErr { + t.Errorf("GetPIDsInContainer() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("GetPIDsInContainer() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestGetPIDsInPod(t *testing.T) { + + system.SetupCgroupPathFormatter(system.Systemd) + defer system.SetupCgroupPathFormatter(system.Systemd) + dir := t.TempDir() + system.Conf.CgroupRootDir = dir + + p1 := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf.scope/cgroup.procs" + p1CgroupPath := path.Join(dir, p1) + if err := writeCgroupContent(p1CgroupPath, []byte("12\n23")); err != nil { + t.Fatal(err) + } + + p2 := "/cpu/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/docker-703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff.scope/cgroup.procs" + p2CgroupPath := path.Join(dir, p2) + if err := writeCgroupContent(p2CgroupPath, []byte("45\n67")); err != nil { + t.Fatal(err) + } + type args struct { + podParentDir string + cs []corev1.ContainerStatus + } + tests := []struct { + name string + args args + want []uint64 + wantErr bool + }{ + { + name: "cgroup", + args: args{ + podParentDir: "kubepods-besteffort.slice/kubepods-besteffort-pod6553a60b_2b97_442a_b6da_a5704d81dd98.slice/", + cs: []corev1.ContainerStatus{ + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4abaf", + }, + { + ContainerID: "docker://703b1b4e811f56673d68f9531204e5dd4963e734e2929a7056fd5f33fde4acff", + }, + }, + }, + want: []uint64{12, 23, 45, 67}, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := GetPIDsInPod(tt.args.podParentDir, tt.args.cs) + if (err != nil) != tt.wantErr { + t.Errorf("GetPIDsInPod() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("GetPIDsInPod() = %v, want %v", got, tt.want) + } + }) + } +} + +func writeCgroupContent(filePath string, content []byte) error { + err := os.MkdirAll(path.Dir(filePath), os.ModePerm) + if err != nil { + return err + } + return ioutil.WriteFile(filePath, content, 0655) +} diff --git a/pkg/util/feature.go b/pkg/util/feature.go index bd7599fda..1c6162803 100644 --- a/pkg/util/feature.go +++ b/pkg/util/feature.go @@ -30,18 +30,17 @@ import ( // RunFeature runs moduleFunc only if interval > 0 AND at least one feature dependency is enabled func RunFeature(moduleFunc func(), featureDependency []featuregate.Feature, interval int, stopCh <-chan struct{}) bool { - ret, _ := RunFeatureWithInit(func() error { return nil }, moduleFunc, 
featureDependency, interval, stopCh) - return ret + return RunFeatureWithInit(func() error { return nil }, moduleFunc, featureDependency, interval, stopCh) } // RunFeatureWithInit runs moduleFunc only if interval > 0 , at least one feature dependency is enabled // and moduleInit function returns nil -func RunFeatureWithInit(moduleInit func() error, moduleFunc func(), featureDependency []featuregate.Feature, interval int, stopCh <-chan struct{}) (bool, error) { +func RunFeatureWithInit(moduleInit func() error, moduleFunc func(), featureDependency []featuregate.Feature, interval int, stopCh <-chan struct{}) bool { moduleInitName := runtime.FuncForPC(reflect.ValueOf(moduleInit).Pointer()).Name() moduleFuncName := runtime.FuncForPC(reflect.ValueOf(moduleFunc).Pointer()).Name() if interval <= 0 { klog.Infof("time interval %v is disabled, skip run %v module", interval, moduleFuncName) - return false, nil + return false } moduleFuncEnabled := len(featureDependency) == 0 @@ -53,16 +52,16 @@ func RunFeatureWithInit(moduleInit func() error, moduleFunc func(), featureDepen } if !moduleFuncEnabled { klog.Infof("all feature dependency %v is disabled, skip run module %v", featureDependency, moduleFuncName) - return false, nil + return false } klog.Infof("starting %v feature init module", moduleInitName) if err := moduleInit(); err != nil { klog.Errorf("starting %v feature init module error %v", moduleInitName, err) - return false, err + return false } klog.Infof("starting %v feature dependency module, interval seconds %v", moduleFuncName, interval) go wait.Until(moduleFunc, time.Duration(interval)*time.Second, stopCh) - return true, nil + return true } diff --git a/pkg/util/resource_test.go b/pkg/util/resource_test.go new file mode 100644 index 000000000..7323916dd --- /dev/null +++ b/pkg/util/resource_test.go @@ -0,0 +1,94 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package util + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func TestIsResourceDiff(t *testing.T) { + type args struct { + old corev1.ResourceList + new corev1.ResourceList + resourceName corev1.ResourceName + diffThreshold float64 + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "the new resource has big enough difference with the old one", + args: args{ + old: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI), + corev1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI), + }, + new: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(9, resource.DecimalSI), + corev1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI), + }, + resourceName: corev1.ResourceCPU, + diffThreshold: 2, + }, + want: true, + }, + { + name: "the new resource doesn't have big enough difference with the old one", + args: args{ + old: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI), + corev1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI), + }, + new: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + corev1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI), + }, + resourceName: corev1.ResourceCPU, + diffThreshold: 2, + }, + want: false, + }, + { + name: "the old resource doesn't have queryed resource type", + args: args{ + old: corev1.ResourceList{ + // corev1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI), + corev1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI), + }, + new: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + corev1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI), + }, + resourceName: corev1.ResourceCPU, + diffThreshold: 2, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := IsResourceDiff(tt.args.old, tt.args.new, tt.args.resourceName, tt.args.diffThreshold); got != tt.want { + t.Errorf("IsResourceDiff() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/util/system/cgroup_resource.go b/pkg/util/system/cgroup_resource.go index 11a27315d..984bd1af8 100644 --- a/pkg/util/system/cgroup_resource.go +++ b/pkg/util/system/cgroup_resource.go @@ -54,6 +54,8 @@ const ( MemHighFileName = "memory.high" MemoryLimitFileName = "memory.limit_in_bytes" MemStatFileName = "memory.stat" + + ProcsFileName = "cgroup.procs" ) var ( @@ -94,6 +96,8 @@ var ( MemMin = CgroupFile{ResourceFileName: MemMinFileName, Subfs: CgroupMemDir, IsAnolisOS: true, Validator: MemMinValidator} MemLow = CgroupFile{ResourceFileName: MemLowFileName, Subfs: CgroupMemDir, IsAnolisOS: true, Validator: MemLowValidator} MemHigh = CgroupFile{ResourceFileName: MemHighFileName, Subfs: CgroupMemDir, IsAnolisOS: true, Validator: MemHighValidator} + + CPUProcs = CgroupFile{ResourceFileName: ProcsFileName, Subfs: CgroupCPUDir, IsAnolisOS: false} ) type CgroupFile struct { diff --git a/pkg/util/system/cgroup_test.go b/pkg/util/system/cgroup_test.go index 758ad85d4..aa14e5c77 100644 --- a/pkg/util/system/cgroup_test.go +++ b/pkg/util/system/cgroup_test.go @@ -23,6 +23,55 @@ import ( "github.com/stretchr/testify/assert" ) +func TestCgroupFileWriteIfDifferent(t *testing.T) { + taskDir := "/" + type args struct { + cgroupTaskDir string + file CgroupFile + value string + currentValue string + } + tests := []struct { + name string + args args + wantErr bool + }{ + 
{ + name: "currentValue is the same as value", + args: args{ + cgroupTaskDir: taskDir, + file: CPUShares, + value: "1024", + currentValue: "1024", + }, + wantErr: false, + }, + { + name: "currentValue is different with value", + args: args{ + cgroupTaskDir: taskDir, + file: CPUShares, + value: "1024", + currentValue: "512", + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := NewFileTestUtil(t) + helper.CreateCgroupFile(taskDir, tt.args.file) + + err := CgroupFileWrite(taskDir, tt.args.file, tt.args.currentValue) + assert.NoError(t, err) + + gotErr := CgroupFileWriteIfDifferent(taskDir, tt.args.file, tt.args.currentValue) + assert.Equal(t, tt.wantErr, gotErr != nil) + + }) + } +} + func TestCgroupFileReadInt(t *testing.T) { taskDir := "/" testingInt64 := int64(1024) diff --git a/pkg/util/system/config.go b/pkg/util/system/config.go index 1a6747341..f2da82498 100644 --- a/pkg/util/system/config.go +++ b/pkg/util/system/config.go @@ -82,16 +82,16 @@ func SetConf(config Config) { } func (c *Config) InitFlags(fs *flag.FlagSet) { - fs.StringVar(&c.CgroupRootDir, "CgroupRootDir", c.CgroupRootDir, "Cgroup root dir") - fs.StringVar(&c.SysFSRootDir, "SysRootDir", c.SysFSRootDir, "host /sys dir in container") - fs.StringVar(&c.SysFSRootDir, "SysFSRootDir", c.SysFSRootDir, "host /sys/fs dir in container, used by resctrl fs") - fs.StringVar(&c.ProcRootDir, "ProcRootDir", c.ProcRootDir, "host /proc dir in container") - fs.StringVar(&c.VarRunRootDir, "VarRunRootDir", c.VarRunRootDir, "host /var/run dir in container") + fs.StringVar(&c.CgroupRootDir, "cgroup-root-dir", c.CgroupRootDir, "Cgroup root dir") + fs.StringVar(&c.SysFSRootDir, "sys-root-dir", c.SysFSRootDir, "host /sys dir in container") + fs.StringVar(&c.SysFSRootDir, "sys-fs-root-dir", c.SysFSRootDir, "host /sys/fs dir in container, used by resctrl fs") + fs.StringVar(&c.ProcRootDir, "proc-root-dir", c.ProcRootDir, "host /proc dir in container") + fs.StringVar(&c.VarRunRootDir, "var-run-root-dir", c.VarRunRootDir, "host /var/run dir in container") - fs.StringVar(&c.CgroupKubePath, "CgroupKubeDir", c.CgroupKubePath, "Cgroup kube dir") + fs.StringVar(&c.CgroupKubePath, "cgroup-kube-dir", c.CgroupKubePath, "Cgroup kube dir") fs.StringVar(&c.NodeNameOverride, "node-name-override", c.NodeNameOverride, "If non-empty, will use this string as identification instead of the actual machine name. ") - fs.StringVar(&c.ContainerdEndPoint, "containerdEndPoint", c.ContainerdEndPoint, "containerd endPoint") - fs.StringVar(&c.DockerEndPoint, "dockerEndPoint", c.DockerEndPoint, "docker endPoint") + fs.StringVar(&c.ContainerdEndPoint, "containerd-endpoint", c.ContainerdEndPoint, "containerd endPoint") + fs.StringVar(&c.DockerEndPoint, "docker-endpoint", c.DockerEndPoint, "docker endPoint") HostSystemInfo = collectVersionInfo() initFilePath() diff --git a/pkg/util/system/config_test.go b/pkg/util/system/config_test.go new file mode 100644 index 000000000..cc9871aa3 --- /dev/null +++ b/pkg/util/system/config_test.go @@ -0,0 +1,56 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package system
+
+import (
+	"flag"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func Test_NewDsModeConfig(t *testing.T) {
+	expectConfig := &Config{
+		CgroupKubePath: "kubepods/",
+		CgroupRootDir:  "/host-cgroup/",
+		ProcRootDir:    "/proc/",
+		SysRootDir:     "/host-sys/",
+		SysFSRootDir:   "/host-sys-fs/",
+		VarRunRootDir:  "/host-var-run/",
+	}
+	defaultConfig := NewDsModeConfig()
+	assert.Equal(t, expectConfig, defaultConfig)
+}
+
+func Test_NewHostModeConfig(t *testing.T) {
+	expectConfig := &Config{
+		CgroupKubePath: "kubepods/",
+		CgroupRootDir:  "/sys/fs/cgroup/",
+		ProcRootDir:    "/proc/",
+		SysRootDir:     "/sys/",
+		SysFSRootDir:   "/sys/fs/",
+		VarRunRootDir:  "/var/run/",
+	}
+	defaultConfig := NewHostModeConfig()
+	assert.Equal(t, expectConfig, defaultConfig)
+}
+
+func Test_InitFlags(t *testing.T) {
+	cfg := NewDsModeConfig()
+	cfg.InitFlags(flag.CommandLine)
+	flag.Parse()
+}
diff --git a/pkg/util/utils.go b/pkg/util/utils.go
index 7b30e5bee..bbc1378fc 100644
--- a/pkg/util/utils.go
+++ b/pkg/util/utils.go
@@ -19,7 +19,10 @@ package util
 import (
 	"encoding/json"
 	"fmt"
+	"github.com/koordinator-sh/koordinator/apis/extension"
 	"io/ioutil"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"path/filepath"
 	"reflect"
 	"sort"
@@ -142,3 +145,40 @@ func MaxInt64(i, j int64) int64 {
 	}
 	return j
 }
+
+// GetNamespacedName returns the namespaced name of the object.
+func GetNamespacedName(obj metav1.Object) string {
+	return fmt.Sprintf("%v/%v", obj.GetNamespace(), obj.GetName())
+}
+
+// GetSubPriority returns the pod's sub-priority in Koordinator,
+// parsed from the pod priority label.
+func GetSubPriority(pod *corev1.Pod) (int32, error) {
+	if pod.Labels != nil {
+		priority, err := strconv.ParseInt(pod.Labels[extension.LabelPodPriority], 0, 32)
+		if err != nil {
+			return 0, err
+		}
+		return int32(priority), nil
+	}
+	// When the pod does not have the KoordinatorPriority label,
+	// the sub-priority defaults to 0.
+	return 0, nil
+}
+
+// StringToGangGroupSlice parses a gang group annotation such as "[gangA,gangB]"
+// into a Go slice: []string{"gangA", "gangB"}.
+// TODO: make the parsing more robust.
+func StringToGangGroupSlice(s string) ([]string, error) {
+	defaultSlice := make([]string, 0)
+	if s == "" {
+		return defaultSlice, nil
+	}
+	length := len(s)
+	// the value must start with '[' and end with ']'
+	if s[0] != '[' || s[length-1] != ']' {
+		return defaultSlice, fmt.Errorf("gangGroup info is illegal")
+	}
+	s = s[1 : length-1]
+	return strings.Split(s, ","), nil
+}
diff --git a/pkg/util/utils_test.go b/pkg/util/utils_test.go
index 63df4843e..d14dd1ccd 100644
--- a/pkg/util/utils_test.go
+++ b/pkg/util/utils_test.go
@@ -279,3 +279,91 @@ func Test_UtilCgroupCPUSet(t *testing.T) {
 	assert.NoError(t, err)
 	assert.Equal(t, cpuset, gotCPUSet)
 }
+
+func TestMinInt64(t *testing.T) {
+	type args struct {
+		i int64
+		j int64
+	}
+	tests := []struct {
+		name string
+		args args
+		want int64
+	}{
+		{
+			name: "i < j",
+			args: args{
+				i: 0,
+				j: 1,
+			},
+			want: 0,
+		},
+		{
+			name: "i > j",
+			args: args{
+				i: 1,
+				j: 0,
+			},
+			want: 0,
+		},
+		{
+			name: "i = j",
+			args: args{
+				i: 0,
+				j: 0,
+			},
+			want: 0,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := MinInt64(tt.args.i, tt.args.j); got != tt.want {
+				t.Errorf("MinInt64() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestMaxInt64(t *testing.T) {
+	type args struct {
+		i int64
+		j int64
+	}
+	tests := []struct {
+		name string
+		args args
+		want int64
+	}{
+		{
+			name: "i < j",
+			args: args{
+				i: 0,
+				j: 1,
+			},
+			want: 1,
+		},
+		{
+			name: "i > j",
+			args: args{
+				i: 1,
+				j: 0,
+			},
+			want: 1,
+		},
+		{
+			name: "i = j",
+			args: args{
+				i: 0,
+				j: 0,
+			},
+			want: 0,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := MaxInt64(tt.args.i, tt.args.j); got != tt.want {
+				t.Errorf("MaxInt64() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}