Merge pull request #806 from spiffxp/prow-build-test
Add infra scripts/terraform for prow build cluster prototype
k8s-ci-robot committed Apr 28, 2020
2 parents e648557 + d58a30f commit 84e0ed5
Showing 8 changed files with 464 additions and 18 deletions.
26 changes: 26 additions & 0 deletions infra/gcp/clusters/kubernetes-public/prow-build-test/00-inputs.tf
@@ -0,0 +1,26 @@
/*
This file defines:
- Required Terraform version
- Required provider versions
- Storage backend details
- GCP project configuration
*/

terraform {
required_version = ">= 0.12.8"

backend "gcs" {
bucket = "k8s-infra-clusters-terraform"
prefix = "kubernetes-public/prow-build-test" // $project_name/$cluster_name
}

required_providers {
google = "~> 2.14"
google-beta = "~> 2.14"
}
}

// This looks up the project where we should install the cluster
data "google_project" "project" {
project_id = "kubernetes-public"
}
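
For anyone exercising this config locally, a minimal workflow might look like the following; this is a sketch assuming gcloud is installed and your account can read the k8s-infra-clusters-terraform bucket and the kubernetes-public project:

# Sketch: initialize the GCS backend declared above and review planned changes.
cd infra/gcp/clusters/kubernetes-public/prow-build-test
# Application-default credentials are used by both the google provider and the GCS backend.
gcloud auth application-default login
terraform init
terraform plan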
@@ -0,0 +1,156 @@
/*
This file defines:
- GCP Service Account for nodes
- Bigquery dataset for usage metering
- GKE cluster configuration
Note that it does not configure any node pools; this is done in a separate file.
*/

locals {
cluster_name = "prow-build-test" // This is the name of the cluster defined in this file
cluster_location = "us-central1" // This is the GCP location (region or zone) where the cluster should be created
bigquery_location = "US" // This is the bigquery specific location where the dataset should be created
}

// Create SA for nodes
resource "google_service_account" "cluster_node_sa" {
project = data.google_project.project.id
account_id = "gke-nodes-${local.cluster_name}"
display_name = "Nodes in GKE cluster '${local.cluster_name}'"
}

// Add roles for SA
resource "google_project_iam_member" "cluster_node_sa_logging" {
project = data.google_project.project.id
role = "roles/logging.logWriter"
member = "serviceAccount:${google_service_account.cluster_node_sa.email}"
}
resource "google_project_iam_member" "cluster_node_sa_monitoring_viewer" {
project = data.google_project.project.id
role = "roles/monitoring.viewer"
member = "serviceAccount:${google_service_account.cluster_node_sa.email}"
}
resource "google_project_iam_member" "cluster_node_sa_monitoring_metricwriter" {
project = data.google_project.project.id
role = "roles/monitoring.metricWriter"
member = "serviceAccount:${google_service_account.cluster_node_sa.email}"
}

// BigQuery dataset for usage data
resource "google_bigquery_dataset" "usage_metering" {
dataset_id = replace("usage_metering_${local.cluster_name}", "-", "_")
project = data.google_project.project.id
description = "GKE Usage Metering for cluster '${local.cluster_name}'"
location = local.bigquery_location

access {
role = "OWNER"
special_group = "projectOwners"
}
access {
role = "WRITER"
user_by_email = google_service_account.cluster_node_sa.email
}

// When false, this prevents destroying the dataset while it still contains data
// IMPORTANT: should be true on test clusters so they can be torn down
delete_contents_on_destroy = true
}

// Create GKE cluster, but with no node pools; node pools are defined in a separate file
resource "google_container_cluster" "cluster" {
name = local.cluster_name
location = local.cluster_location

provider = google-beta
project = data.google_project.project.id

// GKE clusters are critical objects and should not be destroyed
// IMPORTANT: should be false on test clusters
lifecycle {
prevent_destroy = false
}

// Network config
network = "default"

// Start with a single node, because we're going to delete the default pool
initial_node_count = 1

// Removes the default node pool, so we can custom create them as separate
// objects
remove_default_node_pool = true

// Disable local and certificate auth
master_auth {
username = ""
password = ""

client_certificate_config {
issue_client_certificate = false
}
}

// Enable google-groups for RBAC
authenticator_groups_config {
security_group = "gke-security-groups@kubernetes.io"
}

// Enable workload identity for GCP IAM
workload_identity_config {
identity_namespace = "${data.google_project.project.id}.svc.id.goog"
}

// Enable Stackdriver Kubernetes Monitoring
logging_service = "logging.googleapis.com/kubernetes"
monitoring_service = "monitoring.googleapis.com/kubernetes"

// Set maintenance time
maintenance_policy {
daily_maintenance_window {
start_time = "11:00" // (in UTC), 03:00 PST
}
}

// Restrict master to Google IP space; use Cloud Shell to access
master_authorized_networks_config {
}

// Enable GKE Usage Metering
resource_usage_export_config {
enable_network_egress_metering = true
bigquery_destination {
dataset_id = google_bigquery_dataset.usage_metering.dataset_id
}
}

// Enable GKE Network Policy
network_policy {
enabled = true
provider = "CALICO"
}

// Configure cluster addons
addons_config {
horizontal_pod_autoscaling {
disabled = false
}
http_load_balancing {
disabled = false
}
network_policy_config {
disabled = false
}
}

// PodSecurityPolicy enforcement (currently disabled)
pod_security_policy_config {
enabled = false // TODO: we should turn this on
}

// Enable VPA
vertical_pod_autoscaling {
enabled = true
}
}
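
Because master_authorized_networks_config above restricts the master to Google IP space, one plausible way to reach the cluster is from Cloud Shell; a sketch, assuming access to the kubernetes-public project:

# Sketch: fetch credentials for the regional cluster from Cloud Shell,
# which originates from Google IP space, then sanity-check access.
gcloud container clusters get-credentials prow-build-test \
  --region us-central1 --project kubernetes-public
kubectl get nodes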
@@ -0,0 +1,59 @@
/*
This file defines:
- Node pool for pool1
Note: If you wish to create additional node pools, please duplicate this file
and change the resource name, name_prefix, and any other pool-specific settings
(a sketch follows at the end of this file).
*/

resource "google_container_node_pool" "pool1" {
name_prefix = "pool1-"
location = google_container_cluster.cluster.location
cluster = google_container_cluster.cluster.name

provider = google-beta
project = data.google_project.project.id

// Start with a single node
initial_node_count = 1

// Auto repair, and auto upgrade nodes to match the master version
management {
auto_repair = true
auto_upgrade = true
}

// Autoscale the cluster as needed. Note that these values will be multiplied
// by 3, as the cluster will exist in three zones
autoscaling {
min_node_count = 1
max_node_count = 3
}

// Set machine type, and enable all oauth scopes tied to the service account
node_config {
// k8s-prow-builds uses n1-highmem-8
machine_type = "n1-highmem-2"
// k8s-prow-builds uses 250
disk_size_gb = 100
// k8s-prow-builds uses pd-ssd
disk_type = "pd-ssd"

service_account = google_service_account.cluster_node_sa.email
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]

// Needed for workload identity
workload_metadata_config {
node_metadata = "GKE_METADATA_SERVER"
}
metadata = {
disable-legacy-endpoints = "true"
}
}

// If we need to destroy the node pool, create the new one before destroying
// the old one
lifecycle {
create_before_destroy = true
}
}
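
Per the duplication note at the top of this file, a second pool would be a near-copy of pool1 with a new resource name and name_prefix; the following is a hypothetical sketch (pool2 and its sizing are illustrative, not part of this commit):

// Hypothetical pool2, mirroring pool1; only the resource name,
// name_prefix, and pool-specific sizing differ.
resource "google_container_node_pool" "pool2" {
name_prefix = "pool2-"
location = google_container_cluster.cluster.location
cluster = google_container_cluster.cluster.name

provider = google-beta
project = data.google_project.project.id

initial_node_count = 1

management {
auto_repair = true
auto_upgrade = true
}

autoscaling {
min_node_count = 1
max_node_count = 3
}

node_config {
machine_type = "n1-highmem-8" // illustrative sizing
disk_size_gb = 250
disk_type = "pd-ssd"

service_account = google_service_account.cluster_node_sa.email
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]

workload_metadata_config {
node_metadata = "GKE_METADATA_SERVER"
}
metadata = {
disable-legacy-endpoints = "true"
}
}

lifecycle {
create_before_destroy = true
}
}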
155 changes: 155 additions & 0 deletions infra/gcp/ensure-e2e-projects.sh
@@ -0,0 +1,155 @@
#!/usr/bin/env bash
#
# Copyright 2019 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script creates & configures projects intended to be used for e2e
# testing of kubernetes and managed by boskos

set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
. "${SCRIPT_DIR}/lib.sh"

function usage() {
echo "usage: $0 [repo...]" > /dev/stderr
echo "example:" > /dev/stderr
echo " $0 # do all projects" > /dev/stderr
echo " $0 k8s-infra-node-e2e-project # just do one" > /dev/stderr
echo > /dev/stderr
}

## setup service accounts and ips for the prow build cluster

# TODO: replace prow-build-test with actual service account
PROW_BUILD_SVCACCT=$(svc_acct_email "kubernetes-public" "prow-build-test")

color 6 "Ensuring prow build cluster is empowered"
(
color 6 "Ensuring prow build cluster service-account exists"
ensure_service_account \
"kubernetes-public" \
"prow-build-test" \
"used by prowjobs that run in prow-build-test cluster"

color 6 "Empowering prow build cluster service-account to be used on prow build cluster"
# the namespace "test-pods" here must match the namespace defined in prow's config.yaml
# to launch pods defined by prowjobs
# eg: https://github.com/kubernetes/test-infra/blob/master/config/prow/config.yaml#L73
empower_ksa_to_svcacct \
"kubernetes-public.svc.id.goog[test-pods/prow-build]" \
"kubernetes-public" \
"${PROW_BUILD_SVCACCT}"

# manual parts:
# - create key, add to prow-build-test as service-account secret
# - gsutil iam ch serviceAccount:$PROW_BUILD_SVCACCT:objectAdmin gs://bashfire-prow
# - gsutil iam ch serviceAccount:$PROW_BUILD_SVCACCT:objectCreator gs://bashfire-prow
) 2>&1 | indent
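
For context, the empower_ksa_to_svcacct call above presumably reduces to a workload-identity IAM binding along these lines; a sketch, not the actual body of the lib.sh helper:

# Sketch: allow the test-pods/prow-build KSA to impersonate the GCP
# service account via workload identity (what empower_ksa_to_svcacct
# in lib.sh likely wraps; the real helper may differ).
gcloud iam service-accounts add-iam-policy-binding \
  "${PROW_BUILD_SVCACCT}" \
  --project kubernetes-public \
  --role roles/iam.workloadIdentityUser \
  --member "serviceAccount:kubernetes-public.svc.id.goog[test-pods/prow-build]"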

# TODO: replace boskos-janitor-test with actual service account
BOSKOS_JANITOR_SVCACCT=$(svc_acct_email "kubernetes-public" "boskos-janitor-test")

color 6 "Ensuring boskos-janitor is empowered"
(
color 6 "Ensuring boskos-janitor service account exists"
ensure_service_account \
"kubernetes-public" \
"boskos-janitor-test" \
"used by boskos-janitor in prow-build-test cluster"

color 6 "Empowering boskos-janitor service-account to be used on prow build cluster"
# the namespace "test-pods" here must match the namespace defined in prow's config.yaml
# to launch pods defined by prowjobs; most prowjobs as-written assume they can
# talk to either http://boskos (kubetest or bootstrap.py jobs) or
# https://boskos.svc.test-pods.cluster.local (some of the cluster-api jobs), so
# all boskos components are deployed to this namespace
empower_ksa_to_svcacct \
"kubernetes-public.svc.id.goog[test-pods/boskos-janitor]" \
"kubernetes-public" \
"${BOSKOS_JANITOR_SVCACCT}"

color 6 "Ensuring external ip address exists for boskos-metrics service in prow build cluster"
# this is so monitoring.prow.k8s.io is able to scrape metrics from boskos
# TODO: replace this with a global address used by an ingress
ensure_regional_address \
"kubernetes-public" \
"us-central1" \
"boskos-metrics" \
"to allow monitoring.k8s.prow.io to scrape boskos metrics"
) 2>&1 | indent
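
Similarly, ensure_regional_address presumably wraps address creation roughly as follows; a sketch (the real lib.sh helper likely checks for an existing address first):

# Sketch of the underlying address creation for boskos-metrics.
gcloud compute addresses create boskos-metrics \
  --project kubernetes-public \
  --region us-central1 \
  --description "to allow monitoring.prow.k8s.io to scrape boskos metrics"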

## setup projects to be used by e2e tests for standing up clusters

# TODO: replace spiffxp- projects with actual projects
E2E_PROJECTS=(
# for manual use during node-e2e job migration, eg: --gcp-project=spiffxp-node-e2e-project
spiffxp-node-e2e-project
# for manual use during job migration, eg: --gcp-project=spiffxp-gce-project
spiffxp-gce-project
# managed by boskos, part of the gce-project pool, eg: --gcp-project-type=gce-project
spiffxp-boskos-project-01
spiffxp-boskos-project-02
spiffxp-boskos-project-03
)

if [ $# = 0 ]; then
# default to all e2e projects
set -- "${E2E_PROJECTS[@]}"
fi

color 6 "Ensuring e2e projects exist and are appropriately configured"
for prj; do
color 6 "Ensuring e2e project exists and is appropriately configured: ${prj}"
(
ensure_project "${prj}"

color 6 "Enabling APIs necessary for kubernetes e2e jobs to use e2e project: ${prj}"
enable_api "${prj}" compute.googleapis.com
enable_api "${prj}" logging.googleapis.com
enable_api "${prj}" storage-component.googleapis.com

color 6 "Empower prow-build service account to edit e2e project: ${prj}"
# TODO: this is what prow.k8s.io uses today, but it is likely over-permissioned; we
# could look into creating a more constrained IAM role and using that instead
gcloud \
projects add-iam-policy-binding "${prj}" \
--member "serviceAccount:${PROW_BUILD_SVCACCT}" \
--role roles/editor

color 6 "Empower boskos-janitor service account to clean e2e project: ${prj}"
# TODO: this is what prow.k8s.io uses today, but it is likely over-permissioned; we
# could look into creating a more constrained IAM role and using that instead
gcloud \
projects add-iam-policy-binding "${prj}" \
--member "serviceAccount:${BOSKOS_JANITOR_SVCACCT}" \
--role roles/editor

color 6 "Ensure prow-build prowjobs are able to ssh to instances in e2e project: ${prj}"
# TODO: this is what prow.k8s.io does today; we could look into using OS Login instead
prow_build_ssh_pubkey="prow:ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCmYxHh/wwcV0P1aChuFLpl28w6DFyc7G5Xrw1F8wH1Re9AdxyemM2bTZ/PhsP3u9VDnNbyOw3UN00VFdumkFLjLf1WQ7Q6rZDlPjlw7urBIvAMqUecY6ae1znqsZ0dMBxOuPXHznlnjLjM5b7O7q5WsQMCA9Szbmz6DsuSyCuX0It2osBTN+8P/Fa6BNh3W8AF60M7L8/aUzLfbXVS2LIQKAHHD8CWqvXhLPuTJ03iSwFvgtAK1/J2XJwUP+OzAFrxj6A9LW5ZZgk3R3kRKr0xT/L7hga41rB1qy8Uz+Xr/PTVMNGW+nmU4bPgFchCK0JBK7B12ZcdVVFUEdpaAiKZ prow"

# append to project-wide ssh-keys metadata if not present
ssh_pubkeys=$(mktemp "/tmp/${prj}-ssh-keys-XXXX")
gcloud compute project-info describe --project="${prj}" --format=json | \
jq -r '(.commonInstanceMetadata.items//[])[]|select(.key=="ssh-keys").value' > "${ssh_pubkeys}"
if ! grep -q "${prow_build_ssh_pubkey}" "${ssh_pubkeys}"; then
echo "${prow_build_ssh_pubkey}" >> "${ssh_pubkeys}"
gcloud compute project-info add-metadata --project="${prj}" \
--metadata-from-file ssh-keys="${ssh_pubkeys}"
fi
) 2>&1 | indent
done 2>&1 | indent
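
Typical invocations, mirroring the usage text above:

# Ensure every project in E2E_PROJECTS:
./ensure-e2e-projects.sh
# Ensure a single project:
./ensure-e2e-projects.sh spiffxp-boskos-project-01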