Allow provisioner to be configured to force on-demand nodes & disable auto-upgrade #656

Merged
5 changes: 5 additions & 0 deletions tpu-provisioner/cmd/main.go
@@ -83,6 +83,10 @@ func main() {
GCPNodeSecondaryDisk string `envconfig:"GCP_NODE_SECONDARY_DISK" default:""`
GCPNodeSecureBoot bool `envconfig:"GCP_NODE_SECURE_BOOT" default:"true"`

// GCPForceOnDemand forces the controller to create nodes on demand, even if
// the Pod requests a reservation or spot.
GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`

// NodeMinLifespan is the amount of time that should pass between a Node object
// creation and a cleanup of that Node. This needs to be long enough to allow
// the node to become Ready and for a pending Pod to be scheduled on it.
@@ -203,6 +207,7 @@ func main() {
NodeSecondaryDisk: cfg.GCPNodeSecondaryDisk,
NodeTags: cfg.GCPNodeTags,
NodeSecureBoot: cfg.GCPNodeSecureBoot,
ForceOnDemand: cfg.GCPForceOnDemand,
},
Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
}
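For context, the new flag is read from the environment like the existing GCP_* settings. Below is a minimal sketch of how the controller would pick it up, assuming the envconfig struct tags above come from github.com/kelseyhightower/envconfig (the library itself is not shown in this diff):

package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

type config struct {
	// Mirrors the field added in this PR; the default stays false, so existing
	// deployments keep honoring reservation and spot selectors as before.
	GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`
}

func main() {
	var cfg config
	if err := envconfig.Process("", &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Println("force on-demand:", cfg.GCPForceOnDemand)
}

Setting GCP_FORCE_ON_DEMAND=true in the provisioner's environment would then force on-demand node pools cluster-wide.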
43 changes: 23 additions & 20 deletions tpu-provisioner/internal/cloud/gke.go
@@ -276,27 +276,30 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
}

var reservation *containerv1beta1.ReservationAffinity
if resName, ok := p.Spec.NodeSelector["cloud.google.com/reservation-name"]; ok {
reservation = &containerv1beta1.ReservationAffinity{
ConsumeReservationType: "SPECIFIC_RESERVATION",
Key: "compute.googleapis.com/reservation-name",
Values: []string{
resName,
},
}
}

var taints []*containerv1beta1.NodeTaint
var spot bool

if !g.ClusterContext.ForceOnDemand {
if resName, ok := p.Spec.NodeSelector["cloud.google.com/reservation-name"]; ok {
reservation = &containerv1beta1.ReservationAffinity{
ConsumeReservationType: "SPECIFIC_RESERVATION",
Key: "compute.googleapis.com/reservation-name",
Values: []string{
resName,
},
}
}

spot := p.Spec.NodeSelector["cloud.google.com/gke-spot"] == "true"
if spot {
// Add the taint that NAP would add.
// https://cloud.google.com/kubernetes-engine/docs/concepts/spot-vms#spotvms-nap
taints = append(taints, &containerv1beta1.NodeTaint{
Key: "cloud.google.com/gke-spot",
Value: "true",
Effect: "NO_SCHEDULE",
})
spot = p.Spec.NodeSelector["cloud.google.com/gke-spot"] == "true"
if spot {
// Add the taint that NAP would add.
// https://cloud.google.com/kubernetes-engine/docs/concepts/spot-vms#spotvms-nap
taints = append(taints, &containerv1beta1.NodeTaint{
Key: "cloud.google.com/gke-spot",
Value: "true",
Effect: "NO_SCHEDULE",
})
}
}

var secondaryDisks []*containerv1beta1.SecondaryBootDisk
@@ -336,7 +339,7 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
},
Management: &containerv1beta1.NodeManagement{
AutoRepair: true,
AutoUpgrade: true,
AutoUpgrade: false,
},
UpgradeSettings: &containerv1beta1.UpgradeSettings{
MaxSurge: 1,
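The behavioral change above boils down to a precedence rule: the cluster-level ForceOnDemand setting wins over both the reservation-name and gke-spot node selectors on the Pod. A simplified, hypothetical helper (capacityType is not a function in the provisioner) that illustrates the selection order:

package main

import "fmt"

// capacityType mirrors the decision order in nodePoolForPod after this change:
// ForceOnDemand short-circuits both the reservation and spot selectors.
func capacityType(forceOnDemand bool, sel map[string]string) string {
	if forceOnDemand {
		return "on-demand"
	}
	if _, ok := sel["cloud.google.com/reservation-name"]; ok {
		return "reservation"
	}
	if sel["cloud.google.com/gke-spot"] == "true" {
		return "spot"
	}
	return "on-demand"
}

func main() {
	sel := map[string]string{"cloud.google.com/gke-spot": "true"}
	fmt.Println(capacityType(false, sel)) // spot
	fmt.Println(capacityType(true, sel))  // on-demand: spot selector ignored
}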
1 change: 1 addition & 0 deletions tpu-provisioner/internal/cloud/gke_context.go
@@ -11,6 +11,7 @@ type GKEContext struct {
NodeSecondaryDisk string
NodeTags []string
NodeSecureBoot bool
ForceOnDemand bool
}

func (c GKEContext) ClusterName() string {
95 changes: 91 additions & 4 deletions tpu-provisioner/internal/cloud/gke_test.go
@@ -242,7 +242,67 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
{
desc: "spot",
selector: map[string]string{
"cloud.google.com/gke-spot": "true",
},
want: &containerv1beta1.NodePool{
Config: &container.NodeConfig{
Labels: map[string]string{
"google.com/nodepool-manager": "tpu-provisioner",
"google.com/tpu-provisioner-jobset-name": "jobset-test",
"google.com/tpu-provisioner-jobset-namespace": "default",
"google.com/tpu-provisioner-parent-kind": "job",
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
"google.com/tpu-provisioner-parent-namespace": "default",
},
MachineType: "ct5p-hightpu-4t",
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
Spot: true,
Taints: []*container.NodeTaint{
{Effect: "NO_SCHEDULE", Key: "cloud.google.com/gke-spot", Value: "true"},
},
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
{
desc: "spot with forced on demand",
gkeContext: GKEContext{ForceOnDemand: true},
selector: map[string]string{
"cloud.google.com/gke-spot": "true",
},
want: &containerv1beta1.NodePool{
Config: &container.NodeConfig{
Labels: map[string]string{
"google.com/nodepool-manager": "tpu-provisioner",
"google.com/tpu-provisioner-jobset-name": "jobset-test",
"google.com/tpu-provisioner-jobset-namespace": "default",
"google.com/tpu-provisioner-parent-kind": "job",
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
"google.com/tpu-provisioner-parent-namespace": "default",
},
MachineType: "ct5p-hightpu-4t",
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
Spot: false,
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -272,7 +332,34 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
{
desc: "pod with reservation selector but on demand is forced",
selector: map[string]string{"cloud.google.com/reservation-name": "tpu-rsv"},
gkeContext: GKEContext{ForceOnDemand: true},
want: &containerv1beta1.NodePool{
Config: &container.NodeConfig{
Labels: map[string]string{
"google.com/nodepool-manager": "tpu-provisioner",
"google.com/tpu-provisioner-jobset-name": "jobset-test",
"google.com/tpu-provisioner-jobset-namespace": "default",
"google.com/tpu-provisioner-parent-kind": "job",
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
"google.com/tpu-provisioner-parent-namespace": "default",
},
MachineType: "ct5p-hightpu-4t",
ReservationAffinity: nil,
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -298,7 +385,7 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -329,7 +416,7 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
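The new "spot with forced on demand" and "pod with reservation selector but on demand is forced" cases run under the existing table-driven test; assuming the usual module layout, they can be exercised with go test ./internal/cloud/ -run TestNodePoolForPod from the tpu-provisioner directory.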