Skip to content

Commit

Permalink
[TPU Provisioner] Add support for location hint label (#666)
Browse files Browse the repository at this point in the history
* add support for location hint label
  • Loading branch information
danielvegamyhre authored May 9, 2024
1 parent 89b0361 commit 547ef6b
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 9 deletions.
29 changes: 20 additions & 9 deletions tpu-provisioner/internal/cloud/gke.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,15 @@ const (
GKETPUNodeSelector = "cloud.google.com/gke-tpu-topology"
GKEAcceleratorNodeSelector = "cloud.google.com/gke-tpu-accelerator"
GKENodePoolNameLabel = "cloud.google.com/gke-nodepool"
ICIResiliencyLabel = "cloud.google.com/gke-tpu-ici-resiliency"

// ICIResiliencyLabel is used for disabling ICI resiliency. By default, if the label is not
// specified, the TPU slice is created in ICI-resilient mode. To disable ICI resiliency, the
// workload needs to use the node selector or affinity cloud.google.com/gke-tpu-ici-resiliency=false.
ICIResiliencyLabel = "cloud.google.com/gke-tpu-ici-resiliency"

// LocationHintLabel is used for passing in the desired Borg cell in which the node pool's MIG
// should be provisioned.
LocationHintLabel = "cloud.google.com/gke-location-hint"

// Supported accelerator types
V4PodSliceAccelerator = "tpu-v4-podslice"
Expand Down Expand Up @@ -250,14 +258,17 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
LabelJobSetNamespace: p.Namespace,
}

for k, v := range p.Spec.NodeSelector {
// Don't copy GCP/Google labels onto the node.
if (!strings.HasPrefix(k, gcpLabelPrefix) && !strings.HasPrefix(k, googleLabelPrefix)) ||
// Special label used for disabling ICI resiliency, by default if not specified TPU slice
// is created in the ICI resilient mode. To disable the ICI resilient, workload needs
// to use node selector or affinity cloud.google.com/gke-tpu-ici-resiliency=false.
(k == ICIResiliencyLabel) {
labels[k] = v
for labelKey, labelValue := range p.Spec.NodeSelector {
switch labelKey {
case ICIResiliencyLabel:
labels[labelKey] = labelValue
case LocationHintLabel:
labels[labelKey] = labelValue
default:
// Don't copy GCP/Google labels onto the node.
if !strings.HasPrefix(labelKey, gcpLabelPrefix) && !strings.HasPrefix(labelKey, googleLabelPrefix) {
labels[labelKey] = labelValue
}
}
}

Expand Down
26 changes: 26 additions & 0 deletions tpu-provisioner/internal/cloud/gke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,32 @@ func TestNodePoolForPod(t *testing.T) {
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
{
desc: "pod with location hint node selector",
selector: map[string]string{"cloud.google.com/gke-location-hint": "test-location-hint"},
want: &containerv1beta1.NodePool{
Config: &container.NodeConfig{
Labels: map[string]string{
"google.com/nodepool-manager": "tpu-provisioner",
"google.com/tpu-provisioner-jobset-name": "jobset-test",
"google.com/tpu-provisioner-jobset-namespace": "default",
"google.com/tpu-provisioner-parent-kind": "job",
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
"google.com/tpu-provisioner-parent-namespace": "default",
"cloud.google.com/gke-location-hint": "test-location-hint",
},
MachineType: "ct5p-hightpu-4t",
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
},
InitialNodeCount: 512,
Locations: []string{""},
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
}
for _, tc := range tests {
t.Run(tc.desc, func(t *testing.T) {
Expand Down

0 comments on commit 547ef6b

Please sign in to comment.