Skip to content

Commit

Permalink
fix(infra): dynamically generate nomad server count in install script (
Browse files Browse the repository at this point in the history
…#981)

<!-- Please make sure there is an issue that this PR is correlated to. -->

## Changes

<!-- If there are frontend changes, please include screenshots. -->
  • Loading branch information
NathanFlurry committed Jul 23, 2024
1 parent 8be472f commit 9c433d8
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 20 deletions.
23 changes: 9 additions & 14 deletions infra/tf/k8s_infra/nomad.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,9 @@
# complicated + adds another point of failure and (b) it doesn't fix the problem with Nomad server addresses changing.

locals {
# !!! DO NOT CHANGE !!!
#
# This value must be 3, 5, or 7. More = better redundancy, but does not make things faster.
#
# See https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul
nomad_server_count = var.deploy_method_cluster ? 3 : 1

nomad_server_addrs = [for i in range(0, local.nomad_server_count): "127.0.0.1:${6000 + i}"]
nomad_server_addrs = [for i in range(0, var.nomad_server_count): "127.0.0.1:${6000 + i}"]
nomad_server_addrs_escaped = [for addr in local.nomad_server_addrs : "\"${addr}\""]
nomad_server_configmap_data = {
"server.hcl" = <<-EOT
Expand All @@ -36,7 +31,7 @@ locals {
server {
enabled = true
bootstrap_expect = ${local.nomad_server_count}
bootstrap_expect = ${var.nomad_server_count}
server_join {
retry_join = [${join(", ", local.nomad_server_addrs_escaped)}]
Expand Down Expand Up @@ -128,7 +123,7 @@ resource "kubernetes_service" "nomad_server" {
}

resource "kubernetes_service" "nomad_server_indexed" {
count = var.edge_enabled ? local.nomad_server_count : 0
count = var.edge_enabled ? var.nomad_server_count : 0

metadata {
namespace = kubernetes_namespace.nomad.0.metadata.0.name
Expand Down Expand Up @@ -202,7 +197,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
}
}
spec {
replicas = local.nomad_server_count
replicas = var.nomad_server_count

selector {
match_labels = {
Expand Down Expand Up @@ -324,7 +319,7 @@ resource "kubernetes_stateful_set" "nomad_server" {

# Entrypoints
flatten([
for i in range(0, local.nomad_server_count):
for i in range(0, var.nomad_server_count):
[
"--entryPoints.nomad-${i}-rpc-tcp.address=:${5000 + i}/tcp",
"--entryPoints.nomad-${i}-serf-tcp.address=:${6000 + i}/tcp",
Expand All @@ -334,7 +329,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
])

dynamic "port" {
for_each = [for i in range(0, local.nomad_server_count) : i]
for_each = [for i in range(0, var.nomad_server_count) : i]
content {
name = "n-${port.value}-rpc-tcp"
container_port = 5000 + port.value
Expand All @@ -343,7 +338,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
}

dynamic "port" {
for_each = [for i in range(0, local.nomad_server_count) : i]
for_each = [for i in range(0, var.nomad_server_count) : i]
content {
name = "n-${port.value}-serf-tcp"
container_port = 6000 + port.value
Expand All @@ -352,7 +347,7 @@ resource "kubernetes_stateful_set" "nomad_server" {
}

dynamic "port" {
for_each = [for i in range(0, local.nomad_server_count) : i]
for_each = [for i in range(0, var.nomad_server_count) : i]
content {
name = "n-${port.value}-serf-udp"
container_port = 6000 + port.value
Expand Down Expand Up @@ -421,7 +416,7 @@ resource "kubernetes_config_map" "nomad_server_sidecar_traefik_config" {
}

data = {
for i in range(0, local.nomad_server_count):
for i in range(0, var.nomad_server_count):
"nomad-${i}.yaml" => yamlencode({
tcp = {
routers = {
Expand Down
4 changes: 4 additions & 0 deletions infra/tf/k8s_infra/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ variable "authenticate_all_docker_hub_pulls" {
}

# MARK: Nomad
# Number of Nomad servers to run; used for the server StatefulSet replica count,
# `bootstrap_expect`, and the per-server ports/entrypoints.
#
# Must be 1, 3, 5, or 7. The value is supplied by Bolt when it generates the
# Terraform variables, so it should not be set by hand.
variable "nomad_server_count" {
type = number
}

# Gates edge-only resources, such as the per-index Nomad server services
# (their count is 0 when this is false).
variable "edge_enabled" {
type = bool
}
Expand Down
12 changes: 12 additions & 0 deletions lib/bolt/core/src/context/project.rs
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,18 @@ impl ProjectContextData {
.and_then(|dns| dns.provider.as_ref())
.is_some()
}

/// Returns the number of Nomad servers to run, based on the namespace's cluster kind.
///
/// Distributed clusters use 3 servers; single-node clusters use 1.
pub fn nomad_server_count(&self) -> usize {
// !!! DO NOT CHANGE !!!
//
// This value must be 1, 3, 5, or 7. More = better redundancy, but does not make things faster.
//
// See https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul
match self.ns().cluster.kind {
config::ns::ClusterKind::Distributed { .. } => 3,
config::ns::ClusterKind::SingleNode { .. } => 1,
}
}
}

pub struct S3Credentials {
Expand Down
7 changes: 6 additions & 1 deletion lib/bolt/core/src/context/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,12 @@ impl ServiceContextData {
env.insert("RIVET_PROFANITY_FILTER_DISABLE".into(), "1".into());
}

// Nomad
env.insert(
"NOMAD_SERVER_COUNT".into(),
project_ctx.nomad_server_count().to_string(),
);

if let Some(provisioning) = &project_ctx.ns().rivet.provisioning {
if self.depends_on_cluster_config() || matches!(run_context, RunContext::Test { .. }) {
env.insert(
Expand Down Expand Up @@ -1302,7 +1308,6 @@ impl ServiceContextData {

// if self.depends_on_infra() && project_ctx.ns().rivet.provisioning.is_some() {
let tls = terraform::output::read_tls(&project_ctx).await;
let k8s_infra = terraform::output::read_k8s_infra(&project_ctx).await;

env.insert(
"TLS_CERT_LOCALLY_SIGNED_JOB_CERT_PEM".into(),
Expand Down
1 change: 1 addition & 0 deletions lib/bolt/core/src/dep/terraform/gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ async fn vars(ctx: &ProjectContext) {
}

// Edge nodes
vars.insert("nomad_server_count".into(), json!(ctx.nomad_server_count()));
vars.insert(
"edge_enabled".into(),
json!(config.rivet.provisioning.is_some()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ pub fn install() -> String {
include_str!("../files/nomad_install.sh").to_string()
}

pub fn configure() -> String {
let servers = &["127.0.0.1:5000", "127.0.0.1:5001", "127.0.0.1:5002"];
pub fn configure() -> GlobalResult<String> {
let nomad_server_count = util::env::var("NOMAD_SERVER_COUNT")?.parse::<usize>()?;
let servers = (0..nomad_server_count)
.map(|idx| format!("127.0.0.1:{}", 5000 + idx))
.collect::<Vec<_>>();

include_str!("../files/nomad_configure.sh")
Ok(include_str!("../files/nomad_configure.sh")
// HACK: Hardcoded to Linode
.replace("__PUBLIC_IFACE__", "eth0")
// HACK: Hardcoded to Linode
Expand All @@ -27,5 +30,5 @@ pub fn configure() -> String {
.replace(
"__ATS_VLAN_SUBNET__",
&util::net::ats::vlan_ip_net().to_string(),
)
))
}
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pub async fn gen_initialize(
// MARK: Specific pool components
match pool_type {
backend::cluster::PoolType::Job => {
script.push(components::nomad::configure());
script.push(components::nomad::configure()?);

prometheus_targets.insert(
"nomad".into(),
Expand Down

0 comments on commit 9c433d8

Please sign in to comment.