Skip to content

Commit

Permalink
Add TPU CI infrastructure (#5817)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbzomowski authored and golechwierowicz committed Jan 12, 2024
1 parent c7a6c6c commit 45aa70e
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 0 deletions.
7 changes: 7 additions & 0 deletions infra/terraform_modules/arc_v4_container_cluster/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Cluster creation for TPU CI for PyTorch/XLA

This module configures:
* A regional GKE cluster
* A CPU node pool
* An autoscaling v4 TPU node pool
* The installation of Actions Runner Controller (ARC) on the GKE cluster
18 changes: 18 additions & 0 deletions infra/terraform_modules/arc_v4_container_cluster/arc-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Helm values for the ARC runner scale set. This file is a Terraform template:
# the github_repo_url, max_tpu_nodes and runner_image placeholders are filled
# in when it is rendered.
githubConfigUrl: ${github_repo_url}
# Name of the Kubernetes secret carrying the GitHub credentials.
# NOTE(review): nothing in this module appears to create it; presumably it is
# provisioned out of band in the runner namespace. Confirm.
githubConfigSecret: github-pat
# Scale between zero runners and the same cap used for the TPU node pool.
minRunners: 0
maxRunners: ${max_tpu_nodes}
template:
  spec:
    # Pin runner pods to v4 TPU nodes with a 2x2x1 topology.
    nodeSelector:
      cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
      cloud.google.com/gke-tpu-topology: 2x2x1
    containers:
      - name: runner
        image: ${runner_image}
        command:
          - "/home/runner/run.sh"
        # Requests and limits are kept equal: each runner takes 4 TPU chips.
        resources:
          requests:
            google.com/tpu: 4
          limits:
            google.com/tpu: 4
95 changes: 95 additions & 0 deletions infra/terraform_modules/arc_v4_container_cluster/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Google Cloud provider, scoped to the project passed in as var.project_id.
provider "google" {
  project = var.project_id
}

# Helm provider pointed at the GKE cluster declared in this module.
# It authenticates with the access token from the Google client config and
# trusts the cluster's own CA certificate (stored base64-encoded).
provider "helm" {
  kubernetes {
    host  = "https://${google_container_cluster.arc_v4_cluster.endpoint}"
    token = data.google_client_config.default.access_token
    cluster_ca_certificate = base64decode(
      google_container_cluster.arc_v4_cluster.master_auth[0].cluster_ca_certificate
    )
  }
}

# Client configuration of the Google provider; its access_token is what the
# helm provider uses to authenticate against the cluster.
data "google_client_config" "default" {
}

# GKE cluster that hosts the CI runners. The location is a region
# (us-central2), so this is a regional cluster.
resource "google_container_cluster" "arc_v4_cluster" {
  name     = var.cluster_name
  location = "us-central2"

  # A cluster cannot be created without a node pool, so start with a single
  # node and have the default pool removed; the dedicated CPU and TPU pools
  # are declared as separate resources.
  remove_default_node_pool = true
  initial_node_count       = 1

  release_channel {
    channel = "RAPID"
  }

  # Version strings must be quoted. As a bare number the value goes through
  # number-to-string conversion, which would turn e.g. 1.30 into "1.3".
  min_master_version = "1.28"
}

# Fixed-size CPU node pool attached to the ARC cluster.
resource "google_container_node_pool" "arc_v4_cpu_nodes" {
  name    = var.cpu_nodepool_name
  cluster = google_container_cluster.arc_v4_cluster.name

  # Take the location from the cluster itself rather than repeating the
  # literal, so the pool can never drift from the cluster's region.
  location   = google_container_cluster.arc_v4_cluster.location
  node_count = var.cpu_node_count

  node_config {
    # Nodes only need to write logs and publish monitoring data.
    oauth_scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
    ]
  }

  management {
    auto_repair  = true
    auto_upgrade = true
  }
}

# Autoscaling TPU node pool attached to the ARC cluster. It starts empty and
# grows up to var.max_tpu_nodes nodes.
resource "google_container_node_pool" "arc_v4_tpu_nodes" {
  name    = var.tpu_nodepool_name
  cluster = google_container_cluster.arc_v4_cluster.name

  # Take the location from the cluster itself rather than repeating the
  # literal, so the pool can never drift from the cluster's region.
  location = google_container_cluster.arc_v4_cluster.location
  # Nodes are restricted to a single zone of the region.
  node_locations = ["us-central2-b"]

  initial_node_count = 0

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = var.max_tpu_nodes
    location_policy      = "ANY"
  }

  node_config {
    # NOTE(review): presumably the v4 machine type matching the runner pod's
    # TPU node selector (tpu-v4-podslice, 2x2x1) in arc-values.yaml. Confirm.
    machine_type = "ct4p-hightpu-4t"

    # Nodes only need to write logs and publish monitoring data.
    oauth_scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
    ]
  }

  management {
    auto_repair  = true
    auto_upgrade = true
  }
}

# Installs the Actions Runner Controller (scale-set controller chart) into
# var.arc_namespace, creating the namespace when it does not exist.
# NOTE(review): no chart version is set, so the installed version is whatever
# the registry resolves at apply time. Consider pinning.
resource "helm_release" "arc" {
  name  = "actions-runner-controller"
  chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller"

  namespace        = var.arc_namespace
  create_namespace = true
}

# Installs the runner scale set into var.runner_namespace. It is applied only
# after the controller release, via the explicit depends_on.
resource "helm_release" "arc_runner_set" {
  name  = "v4-runner-set"
  chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"

  namespace        = var.runner_namespace
  create_namespace = true

  depends_on = [
    helm_release.arc,
  ]

  values = [
    # Resolve the values template relative to this module's own directory.
    # A path relative to the working directory only works when Terraform is
    # run from a sibling directory of terraform_modules.
    templatefile("${path.module}/arc-values.yaml", {
      github_repo_url = var.github_repo_url
      max_tpu_nodes   = var.max_tpu_nodes
      runner_image    = var.runner_image
    }),
  ]
}
51 changes: 51 additions & 0 deletions infra/terraform_modules/arc_v4_container_cluster/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Used as the name of the GKE cluster resource.
variable "cluster_name" {
  type        = string
  description = "Name of the Container Cluster containing the v4 node pool"
}

# Used as the name of the CPU node pool resource.
variable "cpu_nodepool_name" {
  type        = string
  description = "Name of the CPU Nodepool"
}

# Size of the fixed CPU node pool (passed to node_count).
variable "cpu_node_count" {
  type        = number
  description = "Number of CPU nodes"

  # `number` alone also admits negative and fractional values; reject them
  # here with a clear message rather than at apply time.
  validation {
    condition     = var.cpu_node_count >= 0 && floor(var.cpu_node_count) == var.cpu_node_count
    error_message = "The cpu_node_count value must be a non-negative whole number."
  }
}

# Used as the name of the TPU node pool resource.
variable "tpu_nodepool_name" {
  type        = string
  description = "Name of the TPU Nodepool"
}

# Upper bound shared by the TPU node pool autoscaler (total_max_node_count)
# and the runner scale set (maxRunners in the Helm values template).
variable "max_tpu_nodes" {
  type        = number
  description = "Maximum number of TPU nodes and runners"

  # `number` alone also admits negative and fractional values, which would be
  # rendered verbatim into the Helm values; reject them up front.
  validation {
    condition     = var.max_tpu_nodes >= 0 && floor(var.max_tpu_nodes) == var.max_tpu_nodes
    error_message = "The max_tpu_nodes value must be a non-negative whole number."
  }
}

# Namespace the controller Helm release is installed into.
variable "arc_namespace" {
  type        = string
  description = "The namespace where ARC will reside"
  default     = "arc-systems"
}

# Namespace the runner scale set Helm release is installed into.
variable "runner_namespace" {
  type        = string
  description = "The namespace where the ARC runners will reside"
  default     = "arc-runners"
}

# Rendered into the Helm values as githubConfigUrl.
variable "github_repo_url" {
  type        = string
  description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC"
}

# Project the google provider is configured with.
variable "project_id" {
  type        = string
  description = "The project ID"
}

# Rendered into the Helm values as the runner container image.
variable "runner_image" {
  type        = string
  description = "The Docker image used in the self-hosted runner"
}
11 changes: 11 additions & 0 deletions infra/tpu-pytorch/tpu_ci.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# TPU CI for pytorch/xla: one ARC cluster in the tpu-pytorch project with a
# single CPU node and at most one TPU node / runner.
module "v4_arc_cluster" {
  source = "../terraform_modules/arc_v4_container_cluster"

  # Project and cluster
  project_id   = "tpu-pytorch"
  cluster_name = "tpu-ci"

  # Node pools
  cpu_nodepool_name = "cpu-nodepool"
  cpu_node_count    = 1
  tpu_nodepool_name = "tpu-nodepool"
  max_tpu_nodes     = 1

  # Runner configuration
  github_repo_url = "https://github.com/pytorch/xla"
  # NOTE(review): the image uses a floating "latest" tag, so the runner image
  # can change between pod starts. Confirm this is intended.
  runner_image = "gcr.io/tpu-pytorch/tpu-ci-runner:latest"
}

0 comments on commit 45aa70e

Please sign in to comment.