Delay scale-up including GPU request
Nodes with GPU are expensive and it's likely that a bunch of pods
using them will be created in a batch. In this case we can wait
a bit for all the pods to be created so that we can make a more
efficient scale-up decision.
MaciekPytel committed Mar 5, 2018
1 parent 37ff773 commit 930c210
Showing 4 changed files with 53 additions and 1 deletion.
16 changes: 15 additions & 1 deletion cluster-autoscaler/core/static_autoscaler.go
@@ -26,6 +26,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"

apiv1 "k8s.io/api/core/v1"
kube_client "k8s.io/client-go/kubernetes"
kube_record "k8s.io/client-go/tools/record"

@@ -35,6 +37,10 @@ import (
const (
// How old the oldest unschedulable pod should be before starting scale up.
unschedulablePodTimeBuffer = 2 * time.Second
// How old the oldest unschedulable pod with GPU should be before starting scale up.
// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
// a bit more latency to wait for more pods and make a more informed scale-up decision.
unschedulablePodWithGpuTimeBuffer = 30 * time.Second
)

// StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature
@@ -251,7 +257,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
glog.V(1).Info("No unschedulable pods")
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
glog.V(1).Info("Max total nodes in cluster reached")
-} else if getOldestCreateTime(unschedulablePodsToHelp).Add(unschedulablePodTimeBuffer).After(currentTime) {
+} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
// The assumption here is that these pods have been created very recently and probably there
// is more pods to come. In theory we could check the newest pod time but then if pod were created
// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -361,3 +367,11 @@ func (a *StaticAutoscaler) ExitCleanUp() {
}
utils.DeleteStatusConfigMap(a.AutoscalingContext.ClientSet, a.AutoscalingContext.ConfigNamespace)
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if getOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
}
found, oldest := getOldestCreateTimeWithGpu(pods)
return found && oldest.Add(unschedulablePodWithGpuTimeBuffer).After(currentTime)
}
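
For context, a minimal standalone sketch (not part of the commit) of the timing behaviour introduced above: scale-up is postponed while the oldest unschedulable pod is younger than unschedulablePodTimeBuffer (2s), and also while the oldest unschedulable pod requesting a GPU is younger than unschedulablePodWithGpuTimeBuffer (30s). The fakePod type and the re-implemented helper below are simplified stand-ins for apiv1.Pod, getOldestCreateTime/getOldestCreateTimeWithGpu and gpu.PodRequestsGpu.

package main

import (
	"fmt"
	"time"
)

// fakePod is a simplified stand-in for *apiv1.Pod carrying only the
// fields this sketch needs.
type fakePod struct {
	created time.Time
	hasGpu  bool
}

const (
	podTimeBuffer        = 2 * time.Second  // unschedulablePodTimeBuffer
	podWithGpuTimeBuffer = 30 * time.Second // unschedulablePodWithGpuTimeBuffer
)

// allPodsAreNew mirrors the logic added to static_autoscaler.go above.
func allPodsAreNew(pods []fakePod, now time.Time) bool {
	oldest, oldestGpu, gpuFound := now, now, false
	for _, p := range pods {
		if p.created.Before(oldest) {
			oldest = p.created
		}
		if p.hasGpu {
			gpuFound = true
			if p.created.Before(oldestGpu) {
				oldestGpu = p.created
			}
		}
	}
	if oldest.Add(podTimeBuffer).After(now) {
		return true // every pod was created within the last 2s
	}
	return gpuFound && oldestGpu.Add(podWithGpuTimeBuffer).After(now)
}

func main() {
	now := time.Now()
	pods := []fakePod{
		{created: now.Add(-10 * time.Second), hasGpu: true},
		{created: now.Add(-1 * time.Second), hasGpu: false},
	}
	// The GPU pod is only 10s old, below the 30s GPU buffer, so the
	// autoscaler would keep waiting for more pods before scaling up.
	fmt.Println(allPodsAreNew(pods, now)) // true
}

The longer buffer only kicks in when at least one pending pod actually requests a GPU; a batch of CPU-only pods still gets the regular 2-second delay.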
15 changes: 15 additions & 0 deletions cluster-autoscaler/core/utils.go
@@ -32,6 +32,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"

@@ -501,3 +502,17 @@ func getOldestCreateTime(pods []*apiv1.Pod) time.Time {
}
return oldest
}

func getOldestCreateTimeWithGpu(pods []*apiv1.Pod) (bool, time.Time) {
oldest := time.Now()
gpuFound := false
for _, pod := range pods {
if gpu.PodRequestsGpu(pod) {
gpuFound = true
if oldest.After(pod.CreationTimestamp.Time) {
oldest = pod.CreationTimestamp.Time
}
}
}
return gpuFound, oldest
}
13 changes: 13 additions & 0 deletions cluster-autoscaler/utils/gpu/gpu.go
@@ -96,3 +96,16 @@ func NodeHasGpu(node *apiv1.Node) bool {
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero())
}

// PodRequestsGpu returns true if a given pod has GPU request.
func PodRequestsGpu(pod *apiv1.Pod) bool {
for _, container := range pod.Spec.Containers {
if container.Resources.Requests != nil {
_, gpuFound := container.Resources.Requests[ResourceNvidiaGPU]
if gpuFound {
return true
}
}
}
return false
}
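
A hypothetical usage snippet (not part of the commit) showing what the request detected by PodRequestsGpu looks like in a raw pod spec; the container name is made up, and it assumes the cluster-autoscaler packages are resolvable on the import path.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	pod := &apiv1.Pod{
		Spec: apiv1.PodSpec{
			Containers: []apiv1.Container{{
				Name: "cuda-worker", // illustrative name
				Resources: apiv1.ResourceRequirements{
					Requests: apiv1.ResourceList{
						// Requesting one NVIDIA GPU is what makes the pod
						// eligible for the longer 30s scale-up buffer.
						gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			}},
		},
	}
	fmt.Println(gpu.PodRequestsGpu(pod)) // true
}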
10 changes: 10 additions & 0 deletions cluster-autoscaler/utils/gpu/gpu_test.go
@@ -24,6 +24,7 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/utils/test"

"github.com/stretchr/testify/assert"
)
@@ -192,3 +193,12 @@ func TestNodeHasGpu(t *testing.T) {
}
assert.False(t, NodeHasGpu(nodeNoGpu))
}

func TestPodRequestsGpu(t *testing.T) {
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
podWithGpu := test.BuildTestPod("pod1AnyGpu", 0, 1000)
podWithGpu.Spec.Containers[0].Resources.Requests[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)

assert.False(t, PodRequestsGpu(podNoGpu))
assert.True(t, PodRequestsGpu(podWithGpu))
}
