Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow operators to opt into publishing node and alloc metrics #1501

Merged
merged 2 commits into from
Aug 4, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## 0.4.1 (UNRELEASED)

__BACKWARDS INCOMPATIBILITIES:__
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make it default as true and let people who want to disable it disable it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are keeping the default false since the volume of metrics Nomad emits is quite large.

* telemetry: Operators will have to explicitly opt-in for Nomad client to
publish allocation and node metrics

IMPROVEMENTS:
* core: Allow count 0 on system jobs [GH-1421]
* core: Gracefully handle short lived outages by holding RPC calls [GH-1403]
Expand All @@ -11,6 +15,7 @@ IMPROVEMENTS:
* client: Add killing event to task state [GH-1457]
* client: Fingerprint network speed on Windows [GH-1443]
* telemetry: Circonus integration for telemetry metrics [GH-1459]
* telemetry: Allow operators to opt-in for publishing metrics [GH-1501]

BUG FIXES:
* core: Sanitize empty slices/maps in jobs to avoid incorrect create/destroy
Expand Down
6 changes: 5 additions & 1 deletion client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1397,7 +1397,11 @@ func (c *Client) collectHostStats() {
c.resourceUsageLock.Lock()
c.resourceUsage = ru
c.resourceUsageLock.Unlock()
c.emitStats(ru)

// Publish Node metrics if operator has opted in
if c.config.PublishNodeMetrics {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like you check the config settings inside the function. Are you sure you need the guard up here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@slackpad We do that in the emitMetrics method of task runners, this is for the emitMetrics method of the Client which sends host related metrics.

c.emitStats(ru)
}
case <-c.shutdownCh:
return
}
Expand Down
8 changes: 8 additions & 0 deletions client/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,14 @@ type Config struct {
// StatsCollectionInterval is the interval at which the Nomad client
// collects resource usage stats
StatsCollectionInterval time.Duration

// PublishNodeMetrics determines whether nomad is going to publish node
// level metrics to remote Telemetry sinks
PublishNodeMetrics bool

// PublishAllocationMetrics determines whether nomad is going to publish
// allocation metrics to remote Telemetry sinks
PublishAllocationMetrics bool
}

func (c *Config) Copy() *Config {
Expand Down
6 changes: 2 additions & 4 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ func (r *TaskRunner) Destroy() {
// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
if ru.ResourceUsage.MemoryStats != nil {
if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
Expand All @@ -649,14 +649,12 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
}

if ru.ResourceUsage.CpuStats != nil {
if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
}

//TODO Add Pid stats when we add an API to enable/disable them
}
2 changes: 2 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {

conf.ConsulConfig = a.config.Consul
conf.StatsCollectionInterval = a.config.Telemetry.collectionInterval
conf.PublishNodeMetrics = a.config.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = a.config.Telemetry.PublishAllocationMetrics
return conf, nil
}

Expand Down
2 changes: 2 additions & 0 deletions command/agent/config-test-fixtures/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ telemetry {
statsd_address = "127.0.0.1:2345"
disable_hostname = true
collection_interval = "3s"
publish_allocation_metrics = true
publish_node_metrics = true
}
leave_on_interrupt = true
leave_on_terminate = true
Expand Down
12 changes: 7 additions & 5 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,11 +240,13 @@ type ServerConfig struct {

// Telemetry is the telemetry configuration for the server
type Telemetry struct {
StatsiteAddr string `mapstructure:"statsite_address"`
StatsdAddr string `mapstructure:"statsd_address"`
DisableHostname bool `mapstructure:"disable_hostname"`
CollectionInterval string `mapstructure:"collection_interval"`
collectionInterval time.Duration `mapstructure:"-"`
StatsiteAddr string `mapstructure:"statsite_address"`
StatsdAddr string `mapstructure:"statsd_address"`
DisableHostname bool `mapstructure:"disable_hostname"`
CollectionInterval string `mapstructure:"collection_interval"`
collectionInterval time.Duration `mapstructure:"-"`
PublishAllocationMetrics bool `mapstructure:"publish_allocation_metrics"`
PublishNodeMetrics bool `mapstructure:"publish_node_metrics"`

// Circonus: see https://github.com/circonus-labs/circonus-gometrics
// for more details on the various configuration options.
Expand Down
2 changes: 2 additions & 0 deletions command/agent/config_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,8 @@ func parseTelemetry(result **Telemetry, list *ast.ObjectList) error {
"statsd_address",
"disable_hostname",
"collection_interval",
"publish_allocation_metrics",
"publish_node_metrics",
"circonus_api_token",
"circonus_api_app",
"circonus_api_url",
Expand Down
12 changes: 7 additions & 5 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,13 @@ func TestConfig_Parse(t *testing.T) {
RetryMaxAttempts: 3,
},
Telemetry: &Telemetry{
StatsiteAddr: "127.0.0.1:1234",
StatsdAddr: "127.0.0.1:2345",
DisableHostname: true,
CollectionInterval: "3s",
collectionInterval: 3 * time.Second,
StatsiteAddr: "127.0.0.1:1234",
StatsdAddr: "127.0.0.1:2345",
DisableHostname: true,
CollectionInterval: "3s",
collectionInterval: 3 * time.Second,
PublishAllocationMetrics: true,
PublishNodeMetrics: true,
},
LeaveOnInt: true,
LeaveOnTerm: true,
Expand Down