hashicorp · dadgar · Dec 9, 2015 · Dec 2, 2015 · Dec 2, 2015 · Dec 2, 2015
diff --git a/command/agent/command.go b/command/agent/command.go
@@ -42,6 +42,7 @@ type Command struct {
 	httpServer *HTTPServer
 	logFilter  *logutils.LevelFilter
 	logOutput  io.Writer
+	retryJoinErrCh chan struct{}
 
 	scadaProvider *scada.Provider
 	scadaHttp     *HTTPServer
@@ -71,6 +72,11 @@ func (c *Command) readConfig() *Config {
 
 	// Server-only options
 	flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "")
+	flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "")
+	flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.StartJoin), "join", "")
+	flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "")
+	flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "")
+	flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "")
 
 	// Client-only options
 	flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "")
@@ -100,6 +106,15 @@ func (c *Command) readConfig() *Config {
 		return nil
 	}
 
+	if cmdConfig.Server.RetryInterval != "" {
+		dur, err := time.ParseDuration(cmdConfig.Server.RetryInterval)
+		if err != nil {
+			c.Ui.Error(fmt.Sprintf("Error parsing retry interval: %s", err))
+			return nil
+		}
+		cmdConfig.Server.retryInterval = dur
+	}
+
 	// Split the servers.
 	if servers != "" {
 		cmdConfig.Client.Servers = strings.Split(servers, ",")
@@ -358,6 +373,12 @@ func (c *Command) Run(args []string) int {
 		}
 	}()
 
+	// Join startup nodes if specified
+	if err := c.startupJoin(config); err != nil {
+		c.Ui.Error(err.Error())
+		return 1
+	}
+
 	// Compile agent information for output later
 	info := make(map[string]string)
 	info["client"] = strconv.FormatBool(config.Client.Enabled)
@@ -396,6 +417,10 @@ func (c *Command) Run(args []string) int {
 	// Enable log streaming
 	logGate.Flush()
 
+	// Start retry join process
+	c.retryJoinErrCh = make(chan struct{})
+	go c.retryJoin(config)
+
 	// Wait for exit
 	return c.handleSignals(config)
 }
@@ -413,6 +438,8 @@ WAIT:
 		sig = s
 	case <-c.ShutdownCh:
 		sig = os.Interrupt
+	case <-c.retryJoinErrCh:
+		return 1
 	}
 	c.Ui.Output(fmt.Sprintf("Caught signal: %v", sig))
 
@@ -559,6 +586,52 @@ func (c *Command) setupSCADA(config *Config) error {
 	return nil
 }
 
+// startupJoin performs the one-shot join against the addresses listed in
+// config.Server.StartJoin. It is a no-op returning nil when the server is not
+// enabled or no start-join addresses are configured. A join failure is
+// returned to the caller unchanged.
+func (c *Command) startupJoin(config *Config) error {
+	// Nothing to do unless this is an enabled server with addresses to join.
+	if !config.Server.Enabled || len(config.Server.StartJoin) == 0 {
+		return nil
+	}
+
+	c.Ui.Output("Joining cluster...")
+
+	joined, joinErr := c.agent.server.Join(config.Server.StartJoin)
+	if joinErr != nil {
+		return joinErr
+	}
+
+	msg := fmt.Sprintf("Join completed. Synced with %d initial agents", joined)
+	c.Ui.Info(msg)
+	return nil
+}
+
+// retryJoin repeatedly attempts to join the addresses in
+// config.Server.RetryJoin until a join succeeds or the retries are exhausted.
+//
+// It returns immediately when the server is not enabled or no retry-join
+// addresses are configured. When config.Server.RetryMaxAttempts is greater
+// than zero and more than that many attempts have failed, it closes
+// c.retryJoinErrCh to signal the failure and returns; a value of zero retries
+// indefinitely. The call blocks between attempts, so it is meant to run on
+// its own goroutine.
+func (c *Command) retryJoin(config *Config) {
+	if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled {
+		return
+	}
+
+	logger := c.agent.logger
+
+	// The parsed duration can still be zero when only the string form of the
+	// interval was supplied. Parse it here so the loop below never spins
+	// without pausing between attempts.
+	// NOTE(review): readConfig appears to parse only the command-line value;
+	// confirm whether defaults and config files are parsed elsewhere.
+	interval := config.Server.retryInterval
+	if interval <= 0 && config.Server.RetryInterval != "" {
+		dur, err := time.ParseDuration(config.Server.RetryInterval)
+		if err != nil {
+			logger.Printf("[ERROR] agent: Error parsing retry interval: %s", err)
+			close(c.retryJoinErrCh)
+			return
+		}
+		interval = dur
+	}
+
+	logger.Printf("[INFO] agent: Joining cluster...")
+
+	attempt := 0
+	for {
+		n, err := c.agent.server.Join(config.Server.RetryJoin)
+		if err == nil {
+			logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n)
+			return
+		}
+
+		// attempt counts failed joins; the limit applies only when positive.
+		attempt++
+		if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts {
+			logger.Printf("[ERROR] agent: max join retry exhausted, exiting")
+			close(c.retryJoinErrCh)
+			return
+		}
+
+		logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err,
+			config.Server.RetryInterval)
+		time.Sleep(interval)
+	}
+}
+
 func (c *Command) Synopsis() string {
 	return "Runs a Nomad agent"
 }
@@ -632,6 +705,24 @@ Server Options:
     bootstrapping the cluster. Once <num> servers have joined eachother,
     Nomad initiates the bootstrap process.
 
+  -join=<address>
+    Address of an agent to join at start time. Can be specified
+    multiple times.
+
+  -retry-join=<address>
+    Address of an agent to join at start time with retries enabled.
+    Can be specified multiple times.
+
+  -retry-max=<num>
+    Maximum number of join attempts. Defaults to 0, which will retry
+    indefinitely.
+
+  -retry-interval=<dur>
+    Time to wait between join attempts.
+
+  -rejoin
+    Ignore a previous leave and attempts to rejoin the cluster.
+
 Client Options:
 
   -client

diff --git a/command/agent/command_test.go b/command/agent/command_test.go
@@ -1,11 +1,14 @@
 package agent
 
 import (
+	"fmt"
 	"io/ioutil"
+	"log"
 	"os"
 	"strings"
 	"testing"
 
+	"github.com/hashicorp/nomad/testutil"
 	"github.com/mitchellh/cli"
 )
 
@@ -69,3 +72,58 @@ func TestCommand_Args(t *testing.T) {
 		}
 	}
 }
+
+// TestRetryJoin runs a second agent through Command.Run with -retry-join
+// pointed at the Serf address of a test agent, then waits until that test
+// agent reports two members.
+func TestRetryJoin(t *testing.T) {
+	dir, agent := makeAgent(t, nil)
+	defer os.RemoveAll(dir)
+	defer agent.Shutdown()
+
+	tmpDir, err := ioutil.TempDir("", "nomad")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	doneCh := make(chan struct{})
+	shutdownCh := make(chan struct{})
+
+	// On exit, ask the command to shut down and wait for Run to return so the
+	// goroutine below does not outlive the test.
+	defer func() {
+		close(shutdownCh)
+		<-doneCh
+	}()
+
+	cmd := &Command{
+		ShutdownCh: shutdownCh,
+		Ui:         new(cli.MockUi),
+	}
+
+	serfAddr := fmt.Sprintf(
+		"%s:%d",
+		agent.config.BindAddr,
+		agent.config.Ports.Serf)
+
+	args := []string{
+		"-server",
+		"-data-dir", tmpDir,
+		"-node", fmt.Sprintf(`"Node %d"`, getPort()),
+		"-retry-join", serfAddr,
+		"-retry-interval", "1s",
+	}
+
+	// Run blocks until shutdown, so it runs in the background; a non-zero
+	// exit code is logged because t.Fatalf must not be called off the test
+	// goroutine.
+	go func() {
+		if code := cmd.Run(args); code != 0 {
+			log.Printf("bad: %d", code)
+		}
+		close(doneCh)
+	}()
+
+	testutil.WaitForResult(func() (bool, error) {
+		mem := agent.server.Members()
+		if len(mem) != 2 {
+			return false, fmt.Errorf("bad :%#v", mem)
+		}
+		return true, nil
+	}, func(err error) {
+		// Pass the error as an argument, not as the format string, so any
+		// '%' in the message is printed verbatim.
+		t.Fatalf("err: %s", err)
+	})
+}
diff --git a/command/agent/config.go b/command/agent/config.go
@@ -9,6 +9,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+	"time"
 
 	"github.com/hashicorp/hcl"
 	client "github.com/hashicorp/nomad/client/config"
@@ -181,6 +182,31 @@ type ServerConfig struct {
 
 	// NodeGCThreshold controls how "old" a node must be to be collected by GC.
 	NodeGCThreshold string `hcl:"node_gc_threshold"`
+
+	// StartJoin is a list of addresses to attempt to join when the
+	// agent starts. If Serf is unable to communicate with any of these
+	// addresses, then the agent will error and exit.
+	StartJoin []string `hcl:"start_join"`
+
+	// RetryJoin is a list of addresses to join with retry enabled.
+	RetryJoin []string `hcl:"retry_join"`
+
+	// RetryMaxAttempts specifies the maximum number of times to retry joining a
+	// host on startup. This is useful for cases where we know the node will be
+	// online eventually.
+	RetryMaxAttempts int `hcl:"retry_max"`
+
+	// RetryInterval specifies the amount of time to wait in between join
+	// attempts on agent start. The minimum allowed value is 1 second and
+	// the default is 30s.
+	RetryInterval string        `hcl:"retry_interval"`
+	retryInterval time.Duration `hcl:"-"`
+
+	// RejoinAfterLeave controls our interaction with the cluster after leave.
+	// When set to false (default), a leave causes Nomad to not rejoin
+	// the cluster until an explicit join is received. If this is set to
+	// true, we ignore the leave, and rejoin the cluster on start.
+	RejoinAfterLeave bool `hcl:"rejoin_after_leave"`
 }
 
 // Telemetry is the telemetry configuration for the server
@@ -252,7 +278,11 @@ func DefaultConfig() *Config {
 			NetworkSpeed: 100,
 		},
 		Server: &ServerConfig{
-			Enabled: false,
+			Enabled:          false,
+			StartJoin:        []string{},
+			RetryJoin:        []string{},
+			RetryInterval:    "30s",
+			RetryMaxAttempts: 0,
 		},
 	}
 }
@@ -391,10 +421,30 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
 	if b.NodeGCThreshold != "" {
 		result.NodeGCThreshold = b.NodeGCThreshold
 	}
+	if b.RetryMaxAttempts != 0 {
+		result.RetryMaxAttempts = b.RetryMaxAttempts
+	}
+	if b.RetryInterval != "" {
+		result.RetryInterval = b.RetryInterval
+		result.retryInterval = b.retryInterval
+	}
+	if b.RejoinAfterLeave {
+		result.RejoinAfterLeave = true
+	}
 
 	// Add the schedulers
 	result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...)
 
+	// Copy the start join addresses
+	result.StartJoin = make([]string, 0, len(a.StartJoin)+len(b.StartJoin))
+	result.StartJoin = append(result.StartJoin, a.StartJoin...)
+	result.StartJoin = append(result.StartJoin, b.StartJoin...)
+
+	// Copy the retry join addresses
+	result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin))
+	result.RetryJoin = append(result.RetryJoin, a.RetryJoin...)
+	result.RetryJoin = append(result.RetryJoin, b.RetryJoin...)
+
 	return &result
 }
 

diff --git a/command/agent/config_test.go b/command/agent/config_test.go
@@ -6,6 +6,7 @@ import (
 	"path/filepath"
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -114,6 +115,11 @@ func TestConfig_Merge(t *testing.T) {
 			NumSchedulers:     2,
 			EnabledSchedulers: []string{structs.JobTypeBatch},
 			NodeGCThreshold:   "12h",
+			RejoinAfterLeave:  true,
+			StartJoin:         []string{"1.1.1.1"},
+			RetryJoin:         []string{"1.1.1.1"},
+			RetryInterval:     "10s",
+			retryInterval:     time.Second * 10,
 		},
 		Ports: &Ports{
 			HTTP: 20000,
@@ -384,6 +390,11 @@ func TestConfig_LoadConfigString(t *testing.T) {
 			NumSchedulers:     2,
 			EnabledSchedulers: []string{"test"},
 			NodeGCThreshold:   "12h",
+			RetryJoin:         []string{"1.1.1.1", "2.2.2.2"},
+			StartJoin:         []string{"1.1.1.1", "2.2.2.2"},
+			RetryInterval:     "15s",
+			RejoinAfterLeave:  true,
+			RetryMaxAttempts:  3,
 		},
 		Telemetry: &Telemetry{
 			StatsiteAddr:    "127.0.0.1:1234",
@@ -457,6 +468,11 @@ server {
 	num_schedulers = 2
 	enabled_schedulers = ["test"]
 	node_gc_threshold = "12h"
+	retry_join = [ "1.1.1.1", "2.2.2.2" ]
+	start_join = [ "1.1.1.1", "2.2.2.2" ]
+	retry_max = 3
+	retry_interval = "15s"
+	rejoin_after_leave = true
 }
 telemetry {
 	statsite_address = "127.0.0.1:1234"

diff --git a/website/source/docs/agent/config.html.md b/website/source/docs/agent/config.html.md
@@ -177,6 +177,21 @@ configured on client nodes.
     "1.5h" or "25m". Valid time units are "ns", "us" (or "µs"), "ms", "s",
     "m", "h". Controls how long a node must be in a terminal state before it is
     garbage collected and purged from the system.
+  * <a id="rejoin_after_leave">`rejoin_after_leave`</a> When provided, Nomad will ignore a previous leave and
+    attempt to rejoin the cluster when starting. By default, Nomad treats leave
+    as a permanent intent and does not attempt to join the cluster again when
+    starting. This flag allows the previous state to be used to rejoin the
+    cluster.
+  * <a id="retry_join">`retry_join`</a> Similar to [`start_join`](#start_join) but allows retrying a join
+    if the first attempt fails. This is useful for cases where we know the
+    address will become available eventually.
+  * <a id="retry_interval">`retry_interval`</a> The time to wait between join attempts. Defaults to 30s.
+  * <a id="retry_max">`retry_max`</a> The maximum number of join attempts to be made before exiting
+    with a return code of 1. By default, this is set to 0 which is interpreted
+    as infinite retries.
+  * <a id="start_join">`start_join`</a> An array of strings specifying addresses of nodes to join upon startup.
+    If Nomad is unable to join with any of the specified addresses, agent startup will
+    fail. By default, the agent won't join any nodes when it starts up.
 
 ## Client-specific Options
 
@@ -301,6 +316,8 @@ via CLI arguments. The `agent` command accepts the following arguments:
 * `-dev`: Start the agent in development mode. This enables a pre-configured
   dual-role agent (client + server) which is useful for developing or testing
   Nomad. No other configuration is required to start the agent in this mode.
+* `-join=<address>`: Address of another agent to join upon starting up. This can
+  be specified multiple times to specify multiple agents to join.
 * `-log-level=<level>`: Equivalent to the [log_level](#log_level) config option.
 * `-meta=<key=value>`: Equivalent to the Client [meta](#meta) config option.
 * `-network-interface=<interface>`: Equivalent to the Client
@@ -312,6 +329,10 @@ via CLI arguments. The `agent` command accepts the following arguments:
   config option.
 * `-node-id=<uuid>`: Equivalent to the Client [node_id](#node_id) config option.
 * `-region=<region>`: Equivalent to the [region](#region) config option.
+* `-rejoin`: Equivalent to the [rejoin_after_leave](#rejoin_after_leave) config option.
+* `-retry-interval`: Equivalent to the [retry_interval](#retry_interval) config option.
+* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails.
+* `-retry-max`: Equivalent to the [retry_max](#retry_max) config option.
 * `-server`: Enable server mode on the local agent.
 * `-servers=<host:port>`: Equivalent to the Client [servers](#servers) config
   option.