Skip to content

Commit

Permalink
release v1.2.0
Browse files Browse the repository at this point in the history
Signed-off-by: 佑祎 <zzw261520@alibaba-inc.com>
  • Loading branch information
zwzhang0107 committed Apr 12, 2023
1 parent 15e9a59 commit e158fb3
Show file tree
Hide file tree
Showing 13 changed files with 119 additions and 124 deletions.
14 changes: 7 additions & 7 deletions client/clientset/versioned/fake/register.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions client/clientset/versioned/scheme/register.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion extension/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import corev1 "k8s.io/api/core/v1"

const (
DomainPrefix = "koordinator.sh/"
// use prefix "kubernetes.io/" for extend resource
// ResourceDomainPrefix is a prefix "kubernetes.io/" used by particular extend resources (e.g. batch resources)
ResourceDomainPrefix = corev1.ResourceDefaultNamespacePrefix
// SchedulingDomainPrefix represents the scheduling domain prefix
SchedulingDomainPrefix = "scheduling.koordinator.sh"
Expand Down
6 changes: 3 additions & 3 deletions extension/elastic_quota.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ import (

// RootQuotaName means quotaTree's root/head.
const (
SystemQuotaName = "system"
RootQuotaName = "root"
DefaultQuotaName = "default"
SystemQuotaName = "koordinator-system-quota"
RootQuotaName = "koordinator-root-quota"
DefaultQuotaName = "koordinator-default-quota"
QuotaKoordinatorPrefix = "quota.scheduling.koordinator.sh"
LabelQuotaIsParent = QuotaKoordinatorPrefix + "/is-parent"
LabelQuotaParent = QuotaKoordinatorPrefix + "/parent"
Expand Down
1 change: 1 addition & 0 deletions extension/priority.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

type PriorityClass string

// https://koordinator.sh/docs/architecture/priority/
const (
PriorityProd PriorityClass = "koord-prod"
PriorityMid PriorityClass = "koord-mid"
Expand Down
18 changes: 1 addition & 17 deletions extension/qos.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@ limitations under the License.

package extension

import corev1 "k8s.io/api/core/v1"

type QoSClass string

// https://koordinator.sh/docs/architecture/qos/
const (
QoSLSE QoSClass = "LSE"
QoSLSR QoSClass = "LSR"
Expand All @@ -29,21 +28,6 @@ const (
QoSNone QoSClass = ""
)

func GetPodQoSClass(pod *corev1.Pod) QoSClass {
if pod == nil || pod.Labels == nil {
return QoSNone
}
return GetQoSClassByAttrs(pod.Labels, pod.Annotations)
}

func GetQoSClassByAttrs(labels, annotations map[string]string) QoSClass {
// annotations are for old format adaption reason
if q, exist := labels[LabelPodQoS]; exist {
return GetPodQoSClassByName(q)
}
return QoSNone
}

func GetPodQoSClassByName(qos string) QoSClass {
q := QoSClass(qos)

Expand Down
40 changes: 19 additions & 21 deletions extension/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,25 @@ import (
"encoding/json"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
// Deprecated: because of the limitation of extended resource naming
KoordBatchCPU corev1.ResourceName = DomainPrefix + "batch-cpu"
// Deprecated: because of the limitation of extended resource naming
KoordBatchMemory corev1.ResourceName = DomainPrefix + "batch-memory"

BatchCPU corev1.ResourceName = ResourceDomainPrefix + "batch-cpu"
BatchMemory corev1.ResourceName = ResourceDomainPrefix + "batch-memory"

KoordRDMA corev1.ResourceName = ResourceDomainPrefix + "rdma"
KoordFPGA corev1.ResourceName = ResourceDomainPrefix + "fpga"

KoordGPU corev1.ResourceName = ResourceDomainPrefix + "gpu"
NvidiaGPU corev1.ResourceName = "nvidia.com/gpu"

GPUCore corev1.ResourceName = ResourceDomainPrefix + "gpu-core"
GPUMemory corev1.ResourceName = ResourceDomainPrefix + "gpu-memory"
GPUMemoryRatio corev1.ResourceName = ResourceDomainPrefix + "gpu-memory-ratio"
ResourceNvidiaGPU corev1.ResourceName = "nvidia.com/gpu"
ResourceRDMA corev1.ResourceName = DomainPrefix + "rdma"
ResourceFPGA corev1.ResourceName = DomainPrefix + "fpga"
ResourceGPU corev1.ResourceName = DomainPrefix + "gpu"
ResourceGPUCore corev1.ResourceName = DomainPrefix + "gpu-core"
ResourceGPUMemory corev1.ResourceName = DomainPrefix + "gpu-memory"
ResourceGPUMemoryRatio corev1.ResourceName = DomainPrefix + "gpu-memory-ratio"
)

GPUDriver string = ResourceDomainPrefix + "gpu-driver"
GPUModel string = ResourceDomainPrefix + "gpu-model"
const (
LabelGPUModel string = NodeDomainPrefix + "/gpu-model"
LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
)

const (
Expand Down Expand Up @@ -160,18 +156,20 @@ func GetResourceStatus(annotations map[string]string) (*ResourceStatus, error) {
return resourceStatus, nil
}

func SetResourceStatus(pod *corev1.Pod, status *ResourceStatus) error {
if pod == nil {
func SetResourceStatus(obj metav1.Object, status *ResourceStatus) error {
if obj == nil {
return nil
}
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
annotations := obj.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}
data, err := json.Marshal(status)
if err != nil {
return err
}
pod.Annotations[AnnotationResourceStatus] = string(data)
annotations[AnnotationResourceStatus] = string(data)
obj.SetAnnotations(annotations)
return nil
}

Expand Down
76 changes: 15 additions & 61 deletions extension/scheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,11 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"

schedulingv1alpha1 "github.com/koordinator-sh/apis/scheduling/v1alpha1"
slov1alpha1 "github.com/koordinator-sh/apis/slo/v1alpha1"
)

const (
// LabelReservationOrder controls the preference logic for Reservation.
// Reservation with lower order is preferred to be selected before Reservation with higher order.
// But if it is 0, Reservation will be selected according to the capacity score.
LabelReservationOrder = SchedulingDomainPrefix + "/reservation-order"

// AnnotationReservationAllocated represents the reservation allocated by the pod.
AnnotationReservationAllocated = SchedulingDomainPrefix + "/reservation-allocated"
)

const (
// AnnotationCustomUsageThresholds represents the user-defined resource utilization threshold.
// For specific value definitions, see CustomUsageThresholds
Expand Down Expand Up @@ -78,6 +67,13 @@ const (
GangModeNonStrict = "NonStrict"
)

const (
// Deprecated: kubernetes-sigs/scheduler-plugins/lightweight-coscheduling
LabelLightweightCoschedulingPodGroupName = "pod-group.scheduling.sigs.k8s.io/name"
// Deprecated: kubernetes-sigs/scheduler-plugins/lightweight-coscheduling
LabelLightweightCoschedulingPodGroupMinAvailable = "pod-group.scheduling.sigs.k8s.io/min-available"
)

// CustomUsageThresholds supports user-defined node resource utilization thresholds.
type CustomUsageThresholds struct {
// UsageThresholds indicates the resource utilization threshold of the whole machine.
Expand Down Expand Up @@ -110,51 +106,6 @@ func GetCustomUsageThresholds(node *corev1.Node) (*CustomUsageThresholds, error)
return usageThresholds, nil
}

type ReservationAllocated struct {
Name string `json:"name,omitempty"`
UID types.UID `json:"uid,omitempty"`
}

func GetReservationAllocated(pod *corev1.Pod) (*ReservationAllocated, error) {
if pod.Annotations == nil {
return nil, nil
}
data, ok := pod.Annotations[AnnotationReservationAllocated]
if !ok {
return nil, nil
}
reservationAllocated := &ReservationAllocated{}
err := json.Unmarshal([]byte(data), reservationAllocated)
if err != nil {
return nil, err
}
return reservationAllocated, nil
}

func SetReservationAllocated(pod *corev1.Pod, r *schedulingv1alpha1.Reservation) {
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
reservationAllocated := &ReservationAllocated{
Name: r.Name,
UID: r.UID,
}
data, _ := json.Marshal(reservationAllocated) // assert no error
pod.Annotations[AnnotationReservationAllocated] = string(data)
}

func RemoveReservationAllocated(pod *corev1.Pod, r *schedulingv1alpha1.Reservation) (bool, error) {
reservationAllocated, err := GetReservationAllocated(pod)
if err != nil {
return false, err
}
if reservationAllocated != nil && reservationAllocated.Name == r.Name && reservationAllocated.UID == r.UID {
delete(pod.Annotations, AnnotationReservationAllocated)
return true, nil
}
return false, nil
}

// DeviceAllocations would be injected into Pod as form of annotation during Pre-bind stage.
/*
{
Expand Down Expand Up @@ -183,9 +134,10 @@ type DeviceAllocations map[schedulingv1alpha1.DeviceType][]*DeviceAllocation
type DeviceAllocation struct {
Minor int32 `json:"minor"`
Resources corev1.ResourceList `json:"resources"`
Extension json.RawMessage `json:"extension,omitempty"`
}

func GetDeviceAllocations(podAnnotations map[string]string) (DeviceAllocations, error) {
var GetDeviceAllocations = func(podAnnotations map[string]string) (DeviceAllocations, error) {
deviceAllocations := DeviceAllocations{}
data, ok := podAnnotations[AnnotationDeviceAllocated]
if !ok {
Expand All @@ -198,17 +150,19 @@ func GetDeviceAllocations(podAnnotations map[string]string) (DeviceAllocations,
return deviceAllocations, nil
}

func SetDeviceAllocations(pod *corev1.Pod, allocations DeviceAllocations) error {
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
func SetDeviceAllocations(obj metav1.Object, allocations DeviceAllocations) error {
annotations := obj.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}

data, err := json.Marshal(allocations)
if err != nil {
return err
}

pod.Annotations[AnnotationDeviceAllocated] = string(data)
annotations[AnnotationDeviceAllocated] = string(data)
obj.SetAnnotations(annotations)
return nil
}

Expand Down
2 changes: 2 additions & 0 deletions scheduling/v1alpha1/pod_migration_job_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ const (
PodMigrationJobConditionEviction PodMigrationJobConditionType = "Eviction"
PodMigrationJobConditionPodScheduled PodMigrationJobConditionType = "PodScheduled"
PodMigrationJobConditionReservationPodBoundReservation PodMigrationJobConditionType = "PodBoundReservation"
PodMigrationJobConditionBoundPodReady PodMigrationJobConditionType = "BoundPodReady"
PodMigrationJobConditionReservationBound PodMigrationJobConditionType = "ReservationBound"
)

Expand All @@ -181,6 +182,7 @@ const (
PodMigrationJobReasonFailedEvict = "FailedEvict"
PodMigrationJobReasonEvictComplete = "EvictComplete"
PodMigrationJobReasonWaitForPodBindReservation = "WaitForPodBindReservation"
PodMigrationJobReasonWaitForBoundPodReady = "WaitForBoundPodReady"
)

type PodMigrationJobConditionStatus string
Expand Down
6 changes: 3 additions & 3 deletions scheduling/v1alpha1/reservation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ type ReservationSpec struct {
// reservation would be waiting to be available until free resources are sufficient.
// +optional
PreAllocation bool `json:"preAllocation,omitempty"`
// By default, reserved resources are always allocatable as long as the reservation phase is Available. When
// `AllocateOnce` is set, the reserved resources are only available for the first owner who allocates successfully
// and are not allocatable to other owners anymore.
// When `AllocateOnce` is set, the reserved resources are only available for the first owner who allocates successfully
// and are not allocatable to other owners anymore. Defaults to true.
// +kubebuilder:default=true
// +optional
AllocateOnce bool `json:"allocateOnce,omitempty"`
}
Expand Down
2 changes: 2 additions & 0 deletions slo/v1alpha1/nodemetric_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ type PodMetricInfo struct {
Name string `json:"name,omitempty"`
Namespace string `json:"namespace,omitempty"`
PodUsage ResourceMap `json:"podUsage,omitempty"`
// Third party extensions for PodMetric
Extensions *ExtensionsMap `json:"extensions,omitempty"`
}

// NodeMetricSpec defines the desired state of NodeMetric
Expand Down
23 changes: 19 additions & 4 deletions slo/v1alpha1/nodeslo_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,13 +175,17 @@ type ResourceThresholdStrategy struct {
// +kubebuilder:validation:Minimum=0
MemoryEvictLowerPercent *int64 `json:"memoryEvictLowerPercent,omitempty"`

// if be CPU RealLimit/allocatedLimit > CPUEvictBESatisfactionUpperPercent/100, then stop evict BE pods
// be.satisfactionRate = be.CPURealLimit/be.CPURequest
// if be.satisfactionRate > CPUEvictBESatisfactionUpperPercent/100, then stop evicting.
CPUEvictBESatisfactionUpperPercent *int64 `json:"cpuEvictBESatisfactionUpperPercent,omitempty"`
// if be CPU (RealLimit/allocatedLimit < CPUEvictBESatisfactionLowerPercent/100 and usage >= CPUEvictBEUsageThresholdPercent/100) continue CPUEvictTimeWindowSeconds, then start evict
// be.satisfactionRate = be.CPURealLimit/be.CPURequest; be.cpuUsage = be.CPUUsed/be.CPURealLimit
// if be.satisfactionRate < CPUEvictBESatisfactionLowerPercent/100 && be.usage >= CPUEvictBEUsageThresholdPercent/100,
// then start to evict pod, and will evict to ${CPUEvictBESatisfactionUpperPercent}
CPUEvictBESatisfactionLowerPercent *int64 `json:"cpuEvictBESatisfactionLowerPercent,omitempty"`
// if be CPU (RealLimit/allocatedLimit < CPUEvictBESatisfactionLowerPercent/100 and usage >= CPUEvictBEUsageThresholdPercent/100) continue CPUEvictTimeWindowSeconds, then start evict
// if be.cpuUsage >= CPUEvictBEUsageThresholdPercent/100, then start to calculate the resources need to be released.
CPUEvictBEUsageThresholdPercent *int64 `json:"cpuEvictBEUsageThresholdPercent,omitempty"`
// cpu evict start after continue avg(cpuusage) > CPUEvictThresholdPercent in seconds
// when avg(cpuusage) > CPUEvictThresholdPercent, will start to evict pod by cpu,
// and avg(cpuusage) is calculated based on the most recent CPUEvictTimeWindowSeconds data
CPUEvictTimeWindowSeconds *int64 `json:"cpuEvictTimeWindowSeconds,omitempty"`
}

Expand Down Expand Up @@ -238,6 +242,15 @@ type CPUBurstStrategy struct {
SharePoolThresholdPercent *int64 `json:"sharePoolThresholdPercent,omitempty"`
}

type SystemStrategy struct {
// for /proc/sys/vm/min_free_kbytes, min_free_kbytes = minFreeKbytesFactor * nodeTotalMemory /10000
MinFreeKbytesFactor *int64 `json:"minFreeKbytesFactor,omitempty"`
// /proc/sys/vm/watermark_scale_factor
WatermarkScaleFactor *int64 `json:"watermarkScaleFactor,omitempty"`
// /sys/kernel/mm/memcg_reaper/reap_background
MemcgReapBackGround *int64 `json:"memcgReapBackGround,omitempty"`
}

// NodeSLOSpec defines the desired state of NodeSLO
type NodeSLOSpec struct {
// BE pods will be limited if node resource usage overload
Expand All @@ -246,6 +259,8 @@ type NodeSLOSpec struct {
ResourceQOSStrategy *ResourceQOSStrategy `json:"resourceQOSStrategy,omitempty"`
// CPU Burst Strategy
CPUBurstStrategy *CPUBurstStrategy `json:"cpuBurstStrategy,omitempty"`
// node-level global system config
SystemStrategy *SystemStrategy `json:"systemStrategy,omitempty"`
// Third party extensions for NodeSLO
Extensions *ExtensionsMap `json:"extensions,omitempty"`
}
Expand Down
Loading

0 comments on commit e158fb3

Please sign in to comment.