Delay scale-up including GPU request
Nodes with GPU are expensive and it's likely that a bunch of pods
using them will be created in a batch. In this case we can wait
a bit for all the pods to be created so that we can make a more
efficient scale-up decision.
MaciekPytel committed Mar 5, 2018
1 parent 37ff773 commit 930c210
Showing 4 changed files with 53 additions and 1 deletion.
16 changes: 15 additions & 1 deletion cluster-autoscaler/core/static_autoscaler.go
@@ -26,6 +26,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"

apiv1 "k8s.io/api/core/v1"
kube_client "k8s.io/client-go/kubernetes"
kube_record "k8s.io/client-go/tools/record"

@@ -35,6 +37,10 @@ import (
const (
// How old the oldest unschedulable pod should be before starting scale up.
unschedulablePodTimeBuffer = 2 * time.Second
// How old the oldest unschedulable pod with GPU should be before starting scale up.
// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
// a bit more latency to wait for more pods and make a more informed scale-up decision.
unschedulablePodWithGpuTimeBuffer = 30 * time.Second
)

// StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature
@@ -251,7 +257,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
glog.V(1).Info("No unschedulable pods")
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
glog.V(1).Info("Max total nodes in cluster reached")
-} else if getOldestCreateTime(unschedulablePodsToHelp).Add(unschedulablePodTimeBuffer).After(currentTime) {
+} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
// The assumption here is that these pods have been created very recently and probably there
// is more pods to come. In theory we could check the newest pod time but then if pod were created
// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -361,3 +367,11 @@ func (a *StaticAutoscaler) ExitCleanUp() {
}
utils.DeleteStatusConfigMap(a.AutoscalingContext.ClientSet, a.AutoscalingContext.ConfigNamespace)
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if getOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
}
found, oldest := getOldestCreateTimeWithGpu(pods)
return found && oldest.Add(unschedulablePodWithGpuTimeBuffer).After(currentTime)
}
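
For context, a minimal standalone sketch (not part of the commit) of the timing behaviour introduced above: scale-up is postponed while the oldest unschedulable pod is younger than unschedulablePodTimeBuffer (2s), and also while the oldest unschedulable pod requesting a GPU is younger than unschedulablePodWithGpuTimeBuffer (30s). The fakePod type and the re-implemented helper below are simplified stand-ins for apiv1.Pod, getOldestCreateTime/getOldestCreateTimeWithGpu and gpu.PodRequestsGpu.

package main

import (
	"fmt"
	"time"
)

// fakePod is a simplified stand-in for *apiv1.Pod carrying only the
// fields this sketch needs.
type fakePod struct {
	created time.Time
	hasGpu  bool
}

const (
	podTimeBuffer        = 2 * time.Second  // unschedulablePodTimeBuffer
	podWithGpuTimeBuffer = 30 * time.Second // unschedulablePodWithGpuTimeBuffer
)

// allPodsAreNew mirrors the logic added to static_autoscaler.go above.
func allPodsAreNew(pods []fakePod, now time.Time) bool {
	oldest, oldestGpu, gpuFound := now, now, false
	for _, p := range pods {
		if p.created.Before(oldest) {
			oldest = p.created
		}
		if p.hasGpu {
			gpuFound = true
			if p.created.Before(oldestGpu) {
				oldestGpu = p.created
			}
		}
	}
	if oldest.Add(podTimeBuffer).After(now) {
		return true // every pod was created within the last 2s
	}
	return gpuFound && oldestGpu.Add(podWithGpuTimeBuffer).After(now)
}

func main() {
	now := time.Now()
	pods := []fakePod{
		{created: now.Add(-10 * time.Second), hasGpu: true},
		{created: now.Add(-1 * time.Second), hasGpu: false},
	}
	// The GPU pod is only 10s old, below the 30s GPU buffer, so the
	// autoscaler would keep waiting for more pods before scaling up.
	fmt.Println(allPodsAreNew(pods, now)) // true
}

The longer buffer only kicks in when at least one pending pod actually requests a GPU; a batch of CPU-only pods still gets the regular 2-second delay.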
15 changes: 15 additions & 0 deletions cluster-autoscaler/core/utils.go
@@ -32,6 +32,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"

@@ -501,3 +502,17 @@ func getOldestCreateTime(pods []*apiv1.Pod) time.Time {
}
return oldest
}

func getOldestCreateTimeWithGpu(pods []*apiv1.Pod) (bool, time.Time) {
oldest := time.Now()
gpuFound := false
for _, pod := range pods {
if gpu.PodRequestsGpu(pod) {
gpuFound = true
if oldest.After(pod.CreationTimestamp.Time) {
oldest = pod.CreationTimestamp.Time
}
}
}
return gpuFound, oldest
}
13 changes: 13 additions & 0 deletions cluster-autoscaler/utils/gpu/gpu.go
@@ -96,3 +96,16 @@ func NodeHasGpu(node *apiv1.Node) bool {
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero())
}

// PodRequestsGpu returns true if a given pod has GPU request.
func PodRequestsGpu(pod *apiv1.Pod) bool {
for _, container := range pod.Spec.Containers {
if container.Resources.Requests != nil {
_, gpuFound := container.Resources.Requests[ResourceNvidiaGPU]
if gpuFound {
return true
}
}
}
return false
}
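
A hypothetical usage snippet (not part of the commit) showing what the request detected by PodRequestsGpu looks like in a raw pod spec; the container name is made up, and it assumes the cluster-autoscaler packages are resolvable on the import path.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	pod := &apiv1.Pod{
		Spec: apiv1.PodSpec{
			Containers: []apiv1.Container{{
				Name: "cuda-worker", // illustrative name
				Resources: apiv1.ResourceRequirements{
					Requests: apiv1.ResourceList{
						// Requesting one NVIDIA GPU is what makes the pod
						// eligible for the longer 30s scale-up buffer.
						gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			}},
		},
	}
	fmt.Println(gpu.PodRequestsGpu(pod)) // true
}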
10 changes: 10 additions & 0 deletions cluster-autoscaler/utils/gpu/gpu_test.go
@@ -24,6 +24,7 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/utils/test"

"github.com/stretchr/testify/assert"
)
@@ -192,3 +193,12 @@ func TestNodeHasGpu(t *testing.T) {
}
assert.False(t, NodeHasGpu(nodeNoGpu))
}

func TestPodRequestsGpu(t *testing.T) {
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
podWithGpu := test.BuildTestPod("pod1AnyGpu", 0, 1000)
podWithGpu.Spec.Containers[0].Resources.Requests[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)

assert.False(t, PodRequestsGpu(podNoGpu))
assert.True(t, PodRequestsGpu(podWithGpu))
}
