Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cluster join command line options and configuration options #527

Merged
merged 9 commits into from
Dec 9, 2015
91 changes: 91 additions & 0 deletions command/agent/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ type Command struct {
httpServer *HTTPServer
logFilter *logutils.LevelFilter
logOutput io.Writer
retryJoinErrCh chan struct{}

scadaProvider *scada.Provider
scadaHttp *HTTPServer
Expand Down Expand Up @@ -71,6 +72,11 @@ func (c *Command) readConfig() *Config {

// Server-only options
flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "")
flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "")
flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.StartJoin), "join", "")
flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "")
flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "")
flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "")

// Client-only options
flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "")
Expand Down Expand Up @@ -100,6 +106,15 @@ func (c *Command) readConfig() *Config {
return nil
}

if cmdConfig.Server.RetryInterval != "" {
dur, err := time.ParseDuration(cmdConfig.Server.RetryInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing retry interval: %s", err))
return nil
}
cmdConfig.Server.retryInterval = dur
}

// Split the servers.
if servers != "" {
cmdConfig.Client.Servers = strings.Split(servers, ",")
Expand Down Expand Up @@ -358,6 +373,12 @@ func (c *Command) Run(args []string) int {
}
}()

// Join startup nodes if specified
if err := c.startupJoin(config); err != nil {
c.Ui.Error(err.Error())
return 1
}

// Compile agent information for output later
info := make(map[string]string)
info["client"] = strconv.FormatBool(config.Client.Enabled)
Expand Down Expand Up @@ -396,6 +417,10 @@ func (c *Command) Run(args []string) int {
// Enable log streaming
logGate.Flush()

// Start retry join process
c.retryJoinErrCh = make(chan struct{})
go c.retryJoin(config)

// Wait for exit
return c.handleSignals(config)
}
Expand All @@ -413,6 +438,8 @@ WAIT:
sig = s
case <-c.ShutdownCh:
sig = os.Interrupt
case <-c.retryJoinErrCh:
return 1
}
c.Ui.Output(fmt.Sprintf("Caught signal: %v", sig))

Expand Down Expand Up @@ -559,6 +586,52 @@ func (c *Command) setupSCADA(config *Config) error {
return nil
}

// startupJoin performs a single join against the addresses listed in
// config.Server.StartJoin. It does nothing (and returns nil) when the server
// is not enabled or no addresses were supplied. Any error reported by the
// join is handed back to the caller unchanged.
func (c *Command) startupJoin(config *Config) error {
	addrs := config.Server.StartJoin
	if len(addrs) == 0 || !config.Server.Enabled {
		return nil
	}

	c.Ui.Output("Joining cluster...")

	joined, joinErr := c.agent.server.Join(addrs)
	if joinErr == nil {
		// Only report the synced agent count on a successful join.
		c.Ui.Info(fmt.Sprintf("Join completed. Synced with %d initial agents", joined))
	}
	return joinErr
}

// retryJoin is used to handle retrying a join until it succeeds or all retries
// are exhausted.
//
// It returns immediately when the server is not enabled or no retry-join
// addresses are configured. Otherwise it loops, calling Join on the server
// with config.Server.RetryJoin and sleeping between failed attempts. When
// config.Server.RetryMaxAttempts is positive and the number of failed attempts
// exceeds it, retryJoinErrCh is closed to signal the failure and the function
// returns. A RetryMaxAttempts of zero retries indefinitely.
func (c *Command) retryJoin(config *Config) {
	if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled {
		return
	}

	logger := c.agent.logger

	// The parsed duration is only populated when the interval was parsed
	// ahead of time. If it is unset, fall back to the string form so that a
	// configured interval is still honored instead of sleeping for zero time
	// and spinning on failed joins.
	interval := config.Server.retryInterval
	if interval <= 0 && config.Server.RetryInterval != "" {
		parsed, err := time.ParseDuration(config.Server.RetryInterval)
		if err != nil {
			logger.Printf("[WARN] agent: Error parsing retry interval %q: %v",
				config.Server.RetryInterval, err)
		} else {
			interval = parsed
		}
	}

	logger.Printf("[INFO] agent: Joining cluster...")

	attempt := 0
	for {
		n, err := c.agent.server.Join(config.Server.RetryJoin)
		if err == nil {
			logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n)
			return
		}

		attempt++
		if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts {
			logger.Printf("[ERROR] agent: max join retry exhausted, exiting")
			close(c.retryJoinErrCh)
			return
		}

		// Log the interval that is actually slept, not the raw config string.
		logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, interval)
		time.Sleep(interval)
	}
}

// Synopsis returns the short, one-line description string for this command.
func (c *Command) Synopsis() string {
	const summary = "Runs a Nomad agent"
	return summary
}
Expand Down Expand Up @@ -632,6 +705,24 @@ Server Options:
bootstrapping the cluster. Once <num> servers have joined eachother,
Nomad initiates the bootstrap process.

-join=<address>
Address of an agent to join at start time. Can be specified
multiple times.

-retry-join=<address>
Address of an agent to join at start time with retries enabled.
Can be specified multiple times.

-retry-max=<num>
Maximum number of join attempts. Defaults to 0, which will retry
indefinitely.

-retry-interval=<dur>
Time to wait between join attempts.

-rejoin
Ignore a previous leave and attempts to rejoin the cluster.

Client Options:

-client
Expand Down
58 changes: 58 additions & 0 deletions command/agent/command_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package agent

import (
"fmt"
"io/ioutil"
"log"
"os"
"strings"
"testing"

"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
)

Expand Down Expand Up @@ -69,3 +72,58 @@ func TestCommand_Args(t *testing.T) {
}
}
}

// TestRetryJoin starts a test agent, then runs the agent command in server
// mode with -retry-join pointed at the first agent's Serf address, and waits
// until the first agent's server reports two members.
func TestRetryJoin(t *testing.T) {
	dir, agent := makeAgent(t, nil)
	defer os.RemoveAll(dir)
	defer agent.Shutdown()

	tmpDir, err := ioutil.TempDir("", "nomad")
	if err != nil {
		t.Fatalf("err: %s", err)
	}
	defer os.RemoveAll(tmpDir)

	doneCh := make(chan struct{})
	shutdownCh := make(chan struct{})

	// On exit, ask the command to shut down and wait for Run to return so
	// the goroutine below does not outlive the test.
	defer func() {
		close(shutdownCh)
		<-doneCh
	}()

	cmd := &Command{
		ShutdownCh: shutdownCh,
		Ui:         new(cli.MockUi),
	}

	serfAddr := fmt.Sprintf(
		"%s:%d",
		agent.config.BindAddr,
		agent.config.Ports.Serf)

	args := []string{
		"-server",
		"-data-dir", tmpDir,
		"-node", fmt.Sprintf(`"Node %d"`, getPort()),
		"-retry-join", serfAddr,
		"-retry-interval", "1s",
	}

	go func() {
		if code := cmd.Run(args); code != 0 {
			log.Printf("bad: %d", code)
		}
		close(doneCh)
	}()

	testutil.WaitForResult(func() (bool, error) {
		mem := agent.server.Members()
		if len(mem) != 2 {
			return false, fmt.Errorf("bad :%#v", mem)
		}
		return true, nil
	}, func(err error) {
		// Use a constant format string so any '%' in the error text is
		// printed literally rather than interpreted as a verb.
		t.Fatalf("err: %v", err)
	})
}
52 changes: 51 additions & 1 deletion command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"path/filepath"
"runtime"
"strings"
"time"

"github.com/hashicorp/hcl"
client "github.com/hashicorp/nomad/client/config"
Expand Down Expand Up @@ -181,6 +182,31 @@ type ServerConfig struct {

// NodeGCThreshold controls how "old" a node must be to be collected by GC.
NodeGCThreshold string `hcl:"node_gc_threshold"`

// StartJoin is a list of addresses to attempt to join when the
// agent starts. If Serf is unable to communicate with any of these
// addresses, then the agent will error and exit.
StartJoin []string `hcl:"start_join"`

// RetryJoin is a list of addresses to join with retry enabled.
RetryJoin []string `hcl:"retry_join"`

// RetryMaxAttempts specifies the maximum number of times to retry joining a
// host on startup. This is useful for cases where we know the node will be
// online eventually.
RetryMaxAttempts int `hcl:"retry_max"`

// RetryInterval specifies the amount of time to wait in between join
// attempts on agent start. The minimum allowed value is 1 second and
// the default is 30s.
RetryInterval string `hcl:"retry_interval"`
retryInterval time.Duration `hcl:"-"`

// RejoinAfterLeave controls our interaction with the cluster after leave.
// When set to false (default), a leave causes Nomad to not rejoin
// the cluster until an explicit join is received. If this is set to
// true, we ignore the leave, and rejoin the cluster on start.
RejoinAfterLeave bool `hcl:"rejoin_after_leave"`
}

// Telemetry is the telemetry configuration for the server
Expand Down Expand Up @@ -252,7 +278,11 @@ func DefaultConfig() *Config {
NetworkSpeed: 100,
},
Server: &ServerConfig{
Enabled: false,
Enabled: false,
StartJoin: []string{},
RetryJoin: []string{},
RetryInterval: "30s",
RetryMaxAttempts: 0,
},
}
}
Expand Down Expand Up @@ -391,10 +421,30 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.NodeGCThreshold != "" {
result.NodeGCThreshold = b.NodeGCThreshold
}
if b.RetryMaxAttempts != 0 {
result.RetryMaxAttempts = b.RetryMaxAttempts
}
if b.RetryInterval != "" {
result.RetryInterval = b.RetryInterval
result.retryInterval = b.retryInterval
}
if b.RejoinAfterLeave {
result.RejoinAfterLeave = true
}

// Add the schedulers
result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...)

// Copy the start join addresses
result.StartJoin = make([]string, 0, len(a.StartJoin)+len(b.StartJoin))
result.StartJoin = append(result.StartJoin, a.StartJoin...)
result.StartJoin = append(result.StartJoin, b.StartJoin...)

// Copy the retry join addresses
result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin))
result.RetryJoin = append(result.RetryJoin, a.RetryJoin...)
result.RetryJoin = append(result.RetryJoin, b.RetryJoin...)

return &result
}

Expand Down
16 changes: 16 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"path/filepath"
"reflect"
"testing"
"time"

"github.com/hashicorp/nomad/nomad/structs"
)
Expand Down Expand Up @@ -114,6 +115,11 @@ func TestConfig_Merge(t *testing.T) {
NumSchedulers: 2,
EnabledSchedulers: []string{structs.JobTypeBatch},
NodeGCThreshold: "12h",
RejoinAfterLeave: true,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a test case that runs without any of these options set?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only tests that use these options are TestConfig_Merge, TestConfig_LoadConfigString, and TestRetryJoin. Is there a specific test case that you are thinking of?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific test case that you are thinking of?

No, I just like to make sure we test the default case where none of the new options is specified so we don't get a regression at some point. If there is not an existing test case that covers this we should add a new one. I asked because it's hard to see this from the diff; I will look more closely later.

StartJoin: []string{"1.1.1.1"},
RetryJoin: []string{"1.1.1.1"},
RetryInterval: "10s",
retryInterval: time.Second * 10,
},
Ports: &Ports{
HTTP: 20000,
Expand Down Expand Up @@ -384,6 +390,11 @@ func TestConfig_LoadConfigString(t *testing.T) {
NumSchedulers: 2,
EnabledSchedulers: []string{"test"},
NodeGCThreshold: "12h",
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: "15s",
RejoinAfterLeave: true,
RetryMaxAttempts: 3,
},
Telemetry: &Telemetry{
StatsiteAddr: "127.0.0.1:1234",
Expand Down Expand Up @@ -457,6 +468,11 @@ server {
num_schedulers = 2
enabled_schedulers = ["test"]
node_gc_threshold = "12h"
retry_join = [ "1.1.1.1", "2.2.2.2" ]
start_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
rejoin_after_leave = true
}
telemetry {
statsite_address = "127.0.0.1:1234"
Expand Down
21 changes: 21 additions & 0 deletions website/source/docs/agent/config.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,21 @@ configured on client nodes.
"1.5h" or "25m". Valid time units are "ns", "us" (or "µs"), "ms", "s",
"m", "h". Controls how long a node must be in a terminal state before it is
garbage collected and purged from the system.
* <a id="rejoin_after_leave">`rejoin_after_leave`</a> When provided, Nomad will ignore a previous leave and
attempt to rejoin the cluster when starting. By default, Nomad treats leave
as a permanent intent and does not attempt to join the cluster again when
starting. This flag allows the previous state to be used to rejoin the
cluster.
* <a id="retry_join">`retry_join`</a> Similar to [`start_join`](#start_join) but allows retrying a join
if the first attempt fails. This is useful for cases where we know the
address will become available eventually.
* <a id="retry_interval">`retry_interval`</a> The time to wait between join attempts. Defaults to 30s.
* <a id="retry_max">`retry_max`</a> The maximum number of join attempts to be made before exiting
with a return code of 1. By default, this is set to 0 which is interpreted
as infinite retries.
* <a id="start_join">`start_join`</a> An array of strings specifying addresses of nodes to join upon startup.
If Nomad is unable to join with any of the specified addresses, agent startup will
fail. By default, the agent won't join any nodes when it starts up.

## Client-specific Options

Expand Down Expand Up @@ -301,6 +316,8 @@ via CLI arguments. The `agent` command accepts the following arguments:
* `-dev`: Start the agent in development mode. This enables a pre-configured
dual-role agent (client + server) which is useful for developing or testing
Nomad. No other configuration is required to start the agent in this mode.
* `-join=<address>`: Address of another agent to join upon starting up. This can
be specified multiple times to specify multiple agents to join.
* `-log-level=<level>`: Equivalent to the [log_level](#log_level) config option.
* `-meta=<key=value>`: Equivalent to the Client [meta](#meta) config option.
* `-network-interface=<interface>`: Equivalent to the Client
Expand All @@ -312,6 +329,10 @@ via CLI arguments. The `agent` command accepts the following arguments:
config option.
* `-node-id=<uuid>`: Equivalent to the Client [node_id](#node_id) config option.
* `-region=<region>`: Equivalent to the [region](#region) config option.
* `-rejoin`: Equivalent to the [rejoin_after_leave](#rejoin_after_leave) config option.
* `-retry-interval`: Equivalent to the [retry_interval](#retry_interval) config option.
* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails.
* `-retry-max`: Equivalent to the [retry_max](#retry_max) config option.
* `-server`: Enable server mode on the local agent.
* `-servers=<host:port>`: Equivalent to the Client [servers](#servers) config
option.
Expand Down