From 72f6e4a43ae97545d7b6320690321d0be8d01f71 Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 16:56:56 -0700 Subject: [PATCH 1/7] Lightstep fork of go-contrib instrumentation/host metrics --- lightstep/instrumentation/hostprocess/doc.go | 43 +++ .../hostprocess/hostprocess.go | 327 ++++++++++++++++++ .../hostprocess/hostprocess_test.go | 307 ++++++++++++++++ 3 files changed, 677 insertions(+) create mode 100644 lightstep/instrumentation/hostprocess/doc.go create mode 100644 lightstep/instrumentation/hostprocess/hostprocess.go create mode 100644 lightstep/instrumentation/hostprocess/hostprocess_test.go diff --git a/lightstep/instrumentation/hostprocess/doc.go b/lightstep/instrumentation/hostprocess/doc.go new file mode 100644 index 00000000..fbd4bb5b --- /dev/null +++ b/lightstep/instrumentation/hostprocess/doc.go @@ -0,0 +1,43 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostprocess provides the conventional host metrics +// specified by OpenTelemetry. Host metric events are sometimes +// collected through the OpenTelemetry Collector "hostmetrics" +// receiver running as an agent; this instrumentation is an +// alternative for processes that want to record the same information +// without an agent. +// +// The metric events produced are listed here with attribute dimensions. +// +// Name Attribute +// +// ---------------------------------------------------------------------- +// +// process.cpu.time state=user|system +// system.cpu.time state=user|system|other|idle +// system.memory.usage state=used|available +// system.memory.utilization state=used|available +// system.network.io direction=transmit|receive +// +// These are runtime metrics that are not currently provided by the +// runtime/metrics package: +// +// process.runtime.uptime +// process.runtime.go.gc.cpu.time (see https://github.com/open-telemetry/opentelemetry-go-contrib/issues/316) +// +// See https://github.com/open-telemetry/oteps/blob/main/text/0119-standard-system-metrics.md +// for the definition of these metric instruments. + +package hostprocess // import "github.com/lightstep/otel-launcher-go/lightstep/instrumentation/hostprocess" diff --git a/lightstep/instrumentation/hostprocess/hostprocess.go b/lightstep/instrumentation/hostprocess/hostprocess.go new file mode 100644 index 00000000..86ab25b4 --- /dev/null +++ b/lightstep/instrumentation/hostprocess/hostprocess.go @@ -0,0 +1,327 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostprocess // import "github.com/lightstep/otel-launcher-go/lightstep/instrumentation/hostprocess" + +import ( + "context" + "fmt" + "math" + "runtime" + "sync" + "syscall" + "time" + + "github.com/shirou/gopsutil/v3/cpu" + "github.com/shirou/gopsutil/v3/mem" + "github.com/shirou/gopsutil/v3/net" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/global" + "go.opentelemetry.io/otel/metric/instrument" + "go.opentelemetry.io/otel/metric/instrument/asyncfloat64" + "go.opentelemetry.io/otel/metric/instrument/asyncint64" + "go.opentelemetry.io/otel/metric/unit" +) + +// processStartTime should be initialized before the first GC, ideally. +var processStartTime = time.Now() + +// Host reports the work-in-progress conventional host metrics specified by OpenTelemetry. +type host struct { + meter metric.Meter +} + +// config contains optional settings for reporting host metrics. +type config struct { + // MeterProvider sets the metric.MeterProvider. If nil, the global + // Provider will be used. + MeterProvider metric.MeterProvider +} + +// Option supports configuring optional settings for host metrics. +type Option interface { + apply(*config) +} + +// WithMeterProvider sets the Metric implementation to use for +// reporting. If this option is not used, the global metric.MeterProvider +// will be used. `provider` must be non-nil. +func WithMeterProvider(provider metric.MeterProvider) Option { + return metricProviderOption{provider} +} + +type metricProviderOption struct{ metric.MeterProvider } + +func (o metricProviderOption) apply(c *config) { + if o.MeterProvider != nil { + c.MeterProvider = o.MeterProvider + } +} + +// Attribute sets. +var ( + // Attribute sets for CPU time measurements. + + AttributeCPUTimeUser = []attribute.KeyValue{attribute.String("state", "user")} + AttributeCPUTimeSystem = []attribute.KeyValue{attribute.String("state", "system")} + AttributeCPUTimeOther = []attribute.KeyValue{attribute.String("state", "other")} + AttributeCPUTimeIdle = []attribute.KeyValue{attribute.String("state", "idle")} + + // Attribute sets used for Memory measurements. + + AttributeMemoryAvailable = []attribute.KeyValue{attribute.String("state", "available")} + AttributeMemoryUsed = []attribute.KeyValue{attribute.String("state", "used")} + + // Attribute sets used for Network measurements. + + AttributeNetworkTransmit = []attribute.KeyValue{attribute.String("direction", "transmit")} + AttributeNetworkReceive = []attribute.KeyValue{attribute.String("direction", "receive")} +) + +// newConfig computes a config from a list of Options. +func newConfig(opts ...Option) config { + c := config{ + MeterProvider: global.MeterProvider(), + } + for _, opt := range opts { + opt.apply(&c) + } + return c +} + +// Start initializes reporting of host metrics using the supplied config. +func Start(opts ...Option) error { + c := newConfig(opts...) + if c.MeterProvider == nil { + c.MeterProvider = global.MeterProvider() + } + h := newHost(c) + return h.register() +} + +func newHost(c config) *host { + return &host{ + meter: c.MeterProvider.Meter("otel_launcher_go/host"), + } +} + +func (h *host) register() error { + var ( + err error + + processCPUTime asyncfloat64.Counter + processUptime asyncfloat64.UpDownCounter + processGCCPUTime asyncfloat64.Counter + hostCPUTime asyncfloat64.Counter + + hostMemoryUsage asyncint64.Gauge + hostMemoryUtilization asyncfloat64.Gauge + + networkIOUsage asyncint64.Counter + + // lock prevents a race between batch observer and instrument registration. + lock sync.Mutex + ) + + lock.Lock() + defer lock.Unlock() + + if processCPUTime, err = h.meter.AsyncFloat64().Counter( + "process.cpu.time", + instrument.WithUnit("s"), + instrument.WithDescription( + "Accumulated CPU time spent by this process attributed by state (User, System, ...)", + ), + ); err != nil { + return err + } + + if hostCPUTime, err = h.meter.AsyncFloat64().Counter( + "system.cpu.time", + instrument.WithUnit("s"), + instrument.WithDescription( + "Accumulated CPU time spent by this process host attributed by state (User, System, Other, Idle)", + ), + ); err != nil { + return err + } + + if hostMemoryUsage, err = h.meter.AsyncInt64().UpDownCounter( + "system.memory.usage", + instrument.WithUnit(unit.Bytes), + instrument.WithDescription( + "Memory usage of this process host attributed by memory state (Used, Available)", + ), + ); err != nil { + return err + } + + if hostMemoryUtilization, err = h.meter.AsyncFloat64().Gauge( + "system.memory.utilization", + instrument.WithDescription( + "Memory utilization of this process host attributed by memory state (Used, Available)", + ), + ); err != nil { + return err + } + + if networkIOUsage, err = h.meter.AsyncInt64().Counter( + "system.network.io", + instrument.WithUnit(unit.Bytes), + instrument.WithDescription( + "Bytes transferred attributed by direction (Transmit, Receive)", + ), + ); err != nil { + return err + } + + if processUptime, err = h.meter.AsyncFloat64().UpDownCounter( + "process.uptime", + instrument.WithUnit("s"), + instrument.WithDescription("Seconds since application was initialized"), + ); err != nil { + return err + } + + if processGCCPUTime, err = h.meter.AsyncFloat64().UpDownCounter( + // Note: this name is selected so that if Go's runtime/metrics package + // were to start generating this it would be named /gc/cpu/time:seconds (float64). + "process.runtime.go.gc.cpu.time", + instrument.WithUnit("s"), + instrument.WithDescription("Seconds of garbage collection since application was initialized"), + ); err != nil { + return err + } + + err = h.meter.RegisterCallback( + []instrument.Asynchronous{ + processCPUTime, + hostCPUTime, + hostMemoryUsage, + hostMemoryUtilization, + networkIOUsage, + }, + func(ctx context.Context) { + lock.Lock() + defer lock.Unlock() + + processUser, processSystem, processGC, uptime := h.getProcessTimes() + + hostTimeSlice, err := cpu.TimesWithContext(ctx, false) + if err != nil { + otel.Handle(err) + return + } + if len(hostTimeSlice) != 1 { + otel.Handle(fmt.Errorf("host CPU usage: incorrect summary count")) + return + } + + vmStats, err := mem.VirtualMemoryWithContext(ctx) + if err != nil { + otel.Handle(err) + return + } + + ioStats, err := net.IOCountersWithContext(ctx, false) + if err != nil { + otel.Handle(err) + return + } + if len(ioStats) != 1 { + otel.Handle(fmt.Errorf("host network usage: incorrect summary count")) + return + } + + // Uptime + processUptime.Observe(ctx, uptime) + + // Process CPU time + processCPUTime.Observe(ctx, processUser, AttributeCPUTimeUser...) + processCPUTime.Observe(ctx, processSystem, AttributeCPUTimeSystem...) + + // Process GC CPU time + processGCCPUTime.Observe(ctx, processGC) + + // Host CPU time + hostTime := hostTimeSlice[0] + hostCPUTime.Observe(ctx, hostTime.User, AttributeCPUTimeUser...) + hostCPUTime.Observe(ctx, hostTime.System, AttributeCPUTimeSystem...) + + // Note: "other" is the sum of all other known states. + other := hostTime.Nice + + hostTime.Iowait + + hostTime.Irq + + hostTime.Softirq + + hostTime.Steal + + hostTime.Guest + + hostTime.GuestNice + + hostCPUTime.Observe(ctx, other, AttributeCPUTimeOther...) + hostCPUTime.Observe(ctx, hostTime.Idle, AttributeCPUTimeIdle...) + + // Host memory usage + hostMemoryUsage.Observe(ctx, int64(vmStats.Used), AttributeMemoryUsed...) + hostMemoryUsage.Observe(ctx, int64(vmStats.Available), AttributeMemoryAvailable...) + + // Host memory utilization + hostMemoryUtilization.Observe(ctx, float64(vmStats.Used)/float64(vmStats.Total), AttributeMemoryUsed...) + hostMemoryUtilization.Observe(ctx, float64(vmStats.Available)/float64(vmStats.Total), AttributeMemoryAvailable...) + + // Host network usage + networkIOUsage.Observe(ctx, int64(ioStats[0].BytesSent), AttributeNetworkTransmit...) + networkIOUsage.Observe(ctx, int64(ioStats[0].BytesRecv), AttributeNetworkReceive...) + }) + + if err != nil { + return err + } + + return nil +} + +// getProcessTimes is called with the lock. Calls ReadMemStats() for +// GCCPUFraction because as of Go-1.19 there is no such runtime +// metric. User and system sum to 100% of CPU time (counter); gc is +// an independent, comparable metric value. These are correlated with uptime. +func (h *host) getProcessTimes() (userSeconds, systemSeconds, gcSeconds, uptimeSeconds float64) { + // Would really be better if runtime/metrics exposed this, + // making an expensive call for a single field that is not + // exposed via ReadMemStats(). + var memStats runtime.MemStats + runtime.ReadMemStats(&memStats) + + gomaxprocs := float64(runtime.GOMAXPROCS(0)) + + uptimeSeconds = time.Since(processStartTime).Seconds() + gcSeconds = memStats.GCCPUFraction * uptimeSeconds * gomaxprocs + + var ru syscall.Rusage + if err := syscall.Getrusage(syscall.RUSAGE_SELF, &ru); err != nil { + userSeconds = math.NaN() + systemSeconds = math.NaN() + otel.Handle(fmt.Errorf("getrusage: %w", err)) + return + } + + utime := time.Duration(ru.Utime.Sec)*time.Second + time.Duration(ru.Utime.Usec)*time.Microsecond + stime := time.Duration(ru.Stime.Sec)*time.Second + time.Duration(ru.Stime.Usec)*time.Microsecond + + userSeconds = utime.Seconds() + systemSeconds = stime.Seconds() + return +} diff --git a/lightstep/instrumentation/hostprocess/hostprocess_test.go b/lightstep/instrumentation/hostprocess/hostprocess_test.go new file mode 100644 index 00000000..c3706f81 --- /dev/null +++ b/lightstep/instrumentation/hostprocess/hostprocess_test.go @@ -0,0 +1,307 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostprocess + +import ( + "context" + "fmt" + gonet "net" + "os" + "runtime" + "testing" + "time" + + "github.com/shirou/gopsutil/v3/cpu" + "github.com/shirou/gopsutil/v3/mem" + "github.com/shirou/gopsutil/v3/net" + "github.com/shirou/gopsutil/v3/process" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/sdk/metric/export/aggregation" + "go.opentelemetry.io/otel/sdk/metric/metrictest" +) + +func getMetric(exp *metrictest.Exporter, name string, lbl attribute.KeyValue) float64 { + for _, r := range exp.GetRecords() { + if r.InstrumentName != name { + continue + } + + if lbl.Key != "" { + foundAttribute := false + for _, haveLabel := range r.Attributes { + if haveLabel != lbl { + continue + } + foundAttribute = true + break + } + if !foundAttribute { + continue + } + } + + switch r.AggregationKind { + case aggregation.SumKind, aggregation.HistogramKind: + return r.Sum.CoerceToFloat64(r.NumberKind) + case aggregation.LastValueKind: + return r.LastValue.CoerceToFloat64(r.NumberKind) + default: + panic(fmt.Sprintf("invalid aggregation type: %v", r.AggregationKind)) + } + } + panic("Could not locate a metric in test output") +} + +func TestHostCPU(t *testing.T) { + provider, exp := metrictest.NewTestMeterProvider() + err := Start( + WithMeterProvider(provider), + ) + assert.NoError(t, err) + + // Burn some CPU to be sure we're testing something below. + for start := time.Now(); time.Since(start) < time.Second/2; { + } + + // Note: we use a different library + // ("github.com/shirou/gopsutil/v3/process") to verify process + // CPU times computed from syscall.Getrusage(). + proc, err := process.NewProcess(int32(os.Getpid())) + require.NoError(t, err) + + ctx := context.Background() + processBefore, err := proc.TimesWithContext(ctx) + require.NoError(t, err) + + hostBefore, err := cpu.TimesWithContext(ctx, false) + require.NoError(t, err) + + start := time.Now() + for time.Since(start) < time.Second { + // This has a mix of user and system time, so serves + // the purpose of advancing both process and host, + // user and system CPU usage. + _, err = proc.TimesWithContext(ctx) + require.NoError(t, err) + } + + require.NoError(t, exp.Collect(ctx)) + + processUser := getMetric(exp, "process.cpu.time", AttributeCPUTimeUser[0]) + processSystem := getMetric(exp, "process.cpu.time", AttributeCPUTimeSystem[0]) + + hostUser := getMetric(exp, "system.cpu.time", AttributeCPUTimeUser[0]) + hostSystem := getMetric(exp, "system.cpu.time", AttributeCPUTimeSystem[0]) + + processAfter, err := proc.TimesWithContext(ctx) + require.NoError(t, err) + + hostAfter, err := cpu.TimesWithContext(ctx, false) + require.NoError(t, err) + + // Validate process times: + // User times are in range + require.LessOrEqual(t, processBefore.User, processUser) + require.GreaterOrEqual(t, processAfter.User, processUser) + // System times are in range + require.LessOrEqual(t, processBefore.System, processSystem) + require.GreaterOrEqual(t, processAfter.System, processSystem) + // Ranges are not empty + require.NotEqual(t, processAfter.System, processBefore.System) + require.NotEqual(t, processAfter.User, processBefore.User) + + // Validate host times: + // Correct assumptions: + require.Equal(t, 1, len(hostBefore)) + require.Equal(t, 1, len(hostAfter)) + // User times are in range + require.LessOrEqual(t, hostBefore[0].User, hostUser) + require.GreaterOrEqual(t, hostAfter[0].User, hostUser) + // System times are in range + require.LessOrEqual(t, hostBefore[0].System, hostSystem) + require.GreaterOrEqual(t, hostAfter[0].System, hostSystem) + // Ranges are not empty + require.NotEqual(t, hostAfter[0].System, hostBefore[0].System) + require.NotEqual(t, hostAfter[0].User, hostBefore[0].User) + // TODO: We are not testing host "Other" nor "Idle" and + // generally the specification hasn't been finalized, so + // there's more to do. Moreover, "Other" is not portable and + // "Idle" may not advance on a fully loaded machine => both + // are difficult to test. +} + +func TestHostMemory(t *testing.T) { + provider, exp := metrictest.NewTestMeterProvider() + err := Start( + WithMeterProvider(provider), + ) + assert.NoError(t, err) + + ctx := context.Background() + vMem, err := mem.VirtualMemoryWithContext(ctx) + require.NoError(t, err) + + require.NoError(t, exp.Collect(ctx)) + + hostUsed := getMetric(exp, "system.memory.usage", AttributeMemoryUsed[0]) + assert.Greater(t, hostUsed, 0.0) + assert.LessOrEqual(t, hostUsed, float64(vMem.Total)) + + hostAvailable := getMetric(exp, "system.memory.usage", AttributeMemoryAvailable[0]) + assert.GreaterOrEqual(t, hostAvailable, 0.0) + assert.Less(t, hostAvailable, float64(vMem.Total)) + + hostUsedUtil := getMetric(exp, "system.memory.utilization", AttributeMemoryUsed[0]) + assert.Greater(t, hostUsedUtil, 0.0) + assert.LessOrEqual(t, hostUsedUtil, 1.0) + + hostAvailableUtil := getMetric(exp, "system.memory.utilization", AttributeMemoryAvailable[0]) + assert.GreaterOrEqual(t, hostAvailableUtil, 0.0) + assert.Less(t, hostAvailableUtil, 1.0) + + if hostUsed > hostAvailable { + assert.Greater(t, hostUsedUtil, hostAvailableUtil) + } else { + assert.Less(t, hostUsedUtil, hostAvailableUtil) + } +} + +func sendBytes(t *testing.T, count int) error { + conn1, err := gonet.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + return err + } + defer conn1.Close() + + conn2, err := gonet.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + return err + } + defer conn2.Close() + + data1 := make([]byte, 1000) + data2 := make([]byte, 1000) + for i := range data1 { + data1[i] = byte(i) + } + + for ; count > 0; count -= len(data1) { + _, err = conn1.WriteTo(data1, conn2.LocalAddr()) + if err != nil { + return err + } + _, readAddr, err := conn2.ReadFrom(data2) + if err != nil { + return err + } + + require.Equal(t, "udp", readAddr.Network()) + require.Equal(t, conn1.LocalAddr().String(), readAddr.String()) + } + + return nil +} + +func TestHostNetwork(t *testing.T) { + provider, exp := metrictest.NewTestMeterProvider() + err := Start( + WithMeterProvider(provider), + ) + assert.NoError(t, err) + + ctx := context.Background() + hostBefore, err := net.IOCountersWithContext(ctx, false) + require.NoError(t, err) + + const howMuch = 10000 + err = sendBytes(t, howMuch) + require.NoError(t, err) + + // As we are going to read the /proc file system for this info, sleep a while: + require.Eventually(t, func() bool { + hostAfter, err := net.IOCountersWithContext(ctx, false) + require.NoError(t, err) + + return uint64(howMuch) <= hostAfter[0].BytesSent-hostBefore[0].BytesSent && + uint64(howMuch) <= hostAfter[0].BytesRecv-hostBefore[0].BytesRecv + }, 30*time.Second, time.Second/2) + + require.NoError(t, exp.Collect(ctx)) + hostTransmit := getMetric(exp, "system.network.io", AttributeNetworkTransmit[0]) + hostReceive := getMetric(exp, "system.network.io", AttributeNetworkReceive[0]) + + // Check that the recorded measurements reflect the same change: + require.LessOrEqual(t, uint64(howMuch), uint64(hostTransmit)-hostBefore[0].BytesSent) + require.LessOrEqual(t, uint64(howMuch), uint64(hostReceive)-hostBefore[0].BytesRecv) +} + +func TestProcessUptime(t *testing.T) { + ctx := context.Background() + y2k, err := time.Parse(time.RFC3339, "2000-01-01T00:00:00Z") + require.NoError(t, err) + expectUptime := time.Since(y2k).Seconds() + + var save time.Time + processStartTime, save = y2k, processStartTime + defer func() { + processStartTime = save + }() + + provider, exp := metrictest.NewTestMeterProvider() + h := newHost(config{ + MeterProvider: provider, + }) + require.NoError(t, h.register()) + + require.NoError(t, exp.Collect(ctx)) + procUptime := getMetric(exp, "process.uptime", attribute.KeyValue{}) + + require.LessOrEqual(t, expectUptime, procUptime) +} + +func TestProcessGCCPUTime(t *testing.T) { + ctx := context.Background() + + provider, exp := metrictest.NewTestMeterProvider() + h := newHost(config{ + MeterProvider: provider, + }) + require.NoError(t, h.register()) + + require.NoError(t, exp.Collect(ctx)) + initialUtime := getMetric(exp, "process.cpu.time", AttributeCPUTimeUser[0]) + initialStime := getMetric(exp, "process.cpu.time", AttributeCPUTimeSystem[0]) + initialGCtime := getMetric(exp, "process.runtime.go.gc.cpu.time", attribute.KeyValue{}) + + // Make garabge + for i := 0; i < 2; i++ { + var garbage []struct{} + for start := time.Now(); time.Since(start) < time.Second/16; { + garbage = append(garbage, struct{}{}) + } + garbage = nil + runtime.GC() + + require.NoError(t, exp.Collect(ctx)) + utime := -initialUtime + getMetric(exp, "process.cpu.time", AttributeCPUTimeUser[0]) + stime := -initialStime + getMetric(exp, "process.cpu.time", AttributeCPUTimeSystem[0]) + gctime := -initialGCtime + getMetric(exp, "process.runtime.go.gc.cpu.time", attribute.KeyValue{}) + + require.LessOrEqual(t, gctime, utime+stime) + } +} From 3263fc6d67632ec7e122b904afc2b8011e174167 Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 17:06:10 -0700 Subject: [PATCH 2/7] chlog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fc1da73..d516f64c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## Unreleased +### Added + +- Proposed replacement for go-contrib instrumentation/host added as lightstep/instrumentation/hostprocess. [#268](https://github.com/lightstep/otel-launcher-go/pull/268) + ## [1.10.1](https://github.com/lightstep/otel-launcher-go/releases/tag/v1.10.1) - 2022-08-29 +### Changed + - Revert the default change of temporality to "cumulative" from #258. New users are recommended to configure `WithMetricExporterTemporalityPreference("stateless")` temporality From 5a05bbbd036fcf8a82d702744fc5087ada017264 Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 21:07:28 -0700 Subject: [PATCH 3/7] reduce this PR to fork of host - process metrics --- .../{hostprocess => host}/doc.go | 11 +-- .../hostprocess.go => host/host.go} | 89 +------------------ .../hostprocess_test.go => host/host_test.go} | 78 +--------------- 3 files changed, 7 insertions(+), 171 deletions(-) rename lightstep/instrumentation/{hostprocess => host}/doc.go (75%) rename lightstep/instrumentation/{hostprocess/hostprocess.go => host/host.go} (70%) rename lightstep/instrumentation/{hostprocess/hostprocess_test.go => host/host_test.go} (73%) diff --git a/lightstep/instrumentation/hostprocess/doc.go b/lightstep/instrumentation/host/doc.go similarity index 75% rename from lightstep/instrumentation/hostprocess/doc.go rename to lightstep/instrumentation/host/doc.go index fbd4bb5b..604e8b49 100644 --- a/lightstep/instrumentation/hostprocess/doc.go +++ b/lightstep/instrumentation/host/doc.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package hostprocess provides the conventional host metrics +// Package host provides the conventional host metrics // specified by OpenTelemetry. Host metric events are sometimes // collected through the OpenTelemetry Collector "hostmetrics" // receiver running as an agent; this instrumentation is an @@ -25,19 +25,12 @@ // // ---------------------------------------------------------------------- // -// process.cpu.time state=user|system // system.cpu.time state=user|system|other|idle // system.memory.usage state=used|available // system.memory.utilization state=used|available // system.network.io direction=transmit|receive // -// These are runtime metrics that are not currently provided by the -// runtime/metrics package: -// -// process.runtime.uptime -// process.runtime.go.gc.cpu.time (see https://github.com/open-telemetry/opentelemetry-go-contrib/issues/316) -// // See https://github.com/open-telemetry/oteps/blob/main/text/0119-standard-system-metrics.md // for the definition of these metric instruments. -package hostprocess // import "github.com/lightstep/otel-launcher-go/lightstep/instrumentation/hostprocess" +package host // import "github.com/lightstep/otel-launcher-go/lightstep/instrumentation/host" diff --git a/lightstep/instrumentation/hostprocess/hostprocess.go b/lightstep/instrumentation/host/host.go similarity index 70% rename from lightstep/instrumentation/hostprocess/hostprocess.go rename to lightstep/instrumentation/host/host.go index 86ab25b4..5d0070bb 100644 --- a/lightstep/instrumentation/hostprocess/hostprocess.go +++ b/lightstep/instrumentation/host/host.go @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -package hostprocess // import "github.com/lightstep/otel-launcher-go/lightstep/instrumentation/hostprocess" +package host // import "github.com/lightstep/otel-launcher-go/lightstep/instrumentation/host" import ( "context" "fmt" - "math" - "runtime" "sync" - "syscall" "time" "github.com/shirou/gopsutil/v3/cpu" @@ -123,15 +120,10 @@ func (h *host) register() error { var ( err error - processCPUTime asyncfloat64.Counter - processUptime asyncfloat64.UpDownCounter - processGCCPUTime asyncfloat64.Counter - hostCPUTime asyncfloat64.Counter - - hostMemoryUsage asyncint64.Gauge + hostCPUTime asyncfloat64.Counter + hostMemoryUsage asyncint64.UpDownCounter hostMemoryUtilization asyncfloat64.Gauge - - networkIOUsage asyncint64.Counter + networkIOUsage asyncint64.Counter // lock prevents a race between batch observer and instrument registration. lock sync.Mutex @@ -140,16 +132,6 @@ func (h *host) register() error { lock.Lock() defer lock.Unlock() - if processCPUTime, err = h.meter.AsyncFloat64().Counter( - "process.cpu.time", - instrument.WithUnit("s"), - instrument.WithDescription( - "Accumulated CPU time spent by this process attributed by state (User, System, ...)", - ), - ); err != nil { - return err - } - if hostCPUTime, err = h.meter.AsyncFloat64().Counter( "system.cpu.time", instrument.WithUnit("s"), @@ -189,27 +171,8 @@ func (h *host) register() error { return err } - if processUptime, err = h.meter.AsyncFloat64().UpDownCounter( - "process.uptime", - instrument.WithUnit("s"), - instrument.WithDescription("Seconds since application was initialized"), - ); err != nil { - return err - } - - if processGCCPUTime, err = h.meter.AsyncFloat64().UpDownCounter( - // Note: this name is selected so that if Go's runtime/metrics package - // were to start generating this it would be named /gc/cpu/time:seconds (float64). - "process.runtime.go.gc.cpu.time", - instrument.WithUnit("s"), - instrument.WithDescription("Seconds of garbage collection since application was initialized"), - ); err != nil { - return err - } - err = h.meter.RegisterCallback( []instrument.Asynchronous{ - processCPUTime, hostCPUTime, hostMemoryUsage, hostMemoryUtilization, @@ -219,8 +182,6 @@ func (h *host) register() error { lock.Lock() defer lock.Unlock() - processUser, processSystem, processGC, uptime := h.getProcessTimes() - hostTimeSlice, err := cpu.TimesWithContext(ctx, false) if err != nil { otel.Handle(err) @@ -247,16 +208,6 @@ func (h *host) register() error { return } - // Uptime - processUptime.Observe(ctx, uptime) - - // Process CPU time - processCPUTime.Observe(ctx, processUser, AttributeCPUTimeUser...) - processCPUTime.Observe(ctx, processSystem, AttributeCPUTimeSystem...) - - // Process GC CPU time - processGCCPUTime.Observe(ctx, processGC) - // Host CPU time hostTime := hostTimeSlice[0] hostCPUTime.Observe(ctx, hostTime.User, AttributeCPUTimeUser...) @@ -293,35 +244,3 @@ func (h *host) register() error { return nil } - -// getProcessTimes is called with the lock. Calls ReadMemStats() for -// GCCPUFraction because as of Go-1.19 there is no such runtime -// metric. User and system sum to 100% of CPU time (counter); gc is -// an independent, comparable metric value. These are correlated with uptime. -func (h *host) getProcessTimes() (userSeconds, systemSeconds, gcSeconds, uptimeSeconds float64) { - // Would really be better if runtime/metrics exposed this, - // making an expensive call for a single field that is not - // exposed via ReadMemStats(). - var memStats runtime.MemStats - runtime.ReadMemStats(&memStats) - - gomaxprocs := float64(runtime.GOMAXPROCS(0)) - - uptimeSeconds = time.Since(processStartTime).Seconds() - gcSeconds = memStats.GCCPUFraction * uptimeSeconds * gomaxprocs - - var ru syscall.Rusage - if err := syscall.Getrusage(syscall.RUSAGE_SELF, &ru); err != nil { - userSeconds = math.NaN() - systemSeconds = math.NaN() - otel.Handle(fmt.Errorf("getrusage: %w", err)) - return - } - - utime := time.Duration(ru.Utime.Sec)*time.Second + time.Duration(ru.Utime.Usec)*time.Microsecond - stime := time.Duration(ru.Stime.Sec)*time.Second + time.Duration(ru.Stime.Usec)*time.Microsecond - - userSeconds = utime.Seconds() - systemSeconds = stime.Seconds() - return -} diff --git a/lightstep/instrumentation/hostprocess/hostprocess_test.go b/lightstep/instrumentation/host/host_test.go similarity index 73% rename from lightstep/instrumentation/hostprocess/hostprocess_test.go rename to lightstep/instrumentation/host/host_test.go index c3706f81..614679fb 100644 --- a/lightstep/instrumentation/hostprocess/hostprocess_test.go +++ b/lightstep/instrumentation/host/host_test.go @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -package hostprocess +package host import ( "context" "fmt" gonet "net" "os" - "runtime" "testing" "time" @@ -85,8 +84,6 @@ func TestHostCPU(t *testing.T) { require.NoError(t, err) ctx := context.Background() - processBefore, err := proc.TimesWithContext(ctx) - require.NoError(t, err) hostBefore, err := cpu.TimesWithContext(ctx, false) require.NoError(t, err) @@ -102,29 +99,12 @@ func TestHostCPU(t *testing.T) { require.NoError(t, exp.Collect(ctx)) - processUser := getMetric(exp, "process.cpu.time", AttributeCPUTimeUser[0]) - processSystem := getMetric(exp, "process.cpu.time", AttributeCPUTimeSystem[0]) - hostUser := getMetric(exp, "system.cpu.time", AttributeCPUTimeUser[0]) hostSystem := getMetric(exp, "system.cpu.time", AttributeCPUTimeSystem[0]) - processAfter, err := proc.TimesWithContext(ctx) - require.NoError(t, err) - hostAfter, err := cpu.TimesWithContext(ctx, false) require.NoError(t, err) - // Validate process times: - // User times are in range - require.LessOrEqual(t, processBefore.User, processUser) - require.GreaterOrEqual(t, processAfter.User, processUser) - // System times are in range - require.LessOrEqual(t, processBefore.System, processSystem) - require.GreaterOrEqual(t, processAfter.System, processSystem) - // Ranges are not empty - require.NotEqual(t, processAfter.System, processBefore.System) - require.NotEqual(t, processAfter.User, processBefore.User) - // Validate host times: // Correct assumptions: require.Equal(t, 1, len(hostBefore)) @@ -249,59 +229,3 @@ func TestHostNetwork(t *testing.T) { require.LessOrEqual(t, uint64(howMuch), uint64(hostTransmit)-hostBefore[0].BytesSent) require.LessOrEqual(t, uint64(howMuch), uint64(hostReceive)-hostBefore[0].BytesRecv) } - -func TestProcessUptime(t *testing.T) { - ctx := context.Background() - y2k, err := time.Parse(time.RFC3339, "2000-01-01T00:00:00Z") - require.NoError(t, err) - expectUptime := time.Since(y2k).Seconds() - - var save time.Time - processStartTime, save = y2k, processStartTime - defer func() { - processStartTime = save - }() - - provider, exp := metrictest.NewTestMeterProvider() - h := newHost(config{ - MeterProvider: provider, - }) - require.NoError(t, h.register()) - - require.NoError(t, exp.Collect(ctx)) - procUptime := getMetric(exp, "process.uptime", attribute.KeyValue{}) - - require.LessOrEqual(t, expectUptime, procUptime) -} - -func TestProcessGCCPUTime(t *testing.T) { - ctx := context.Background() - - provider, exp := metrictest.NewTestMeterProvider() - h := newHost(config{ - MeterProvider: provider, - }) - require.NoError(t, h.register()) - - require.NoError(t, exp.Collect(ctx)) - initialUtime := getMetric(exp, "process.cpu.time", AttributeCPUTimeUser[0]) - initialStime := getMetric(exp, "process.cpu.time", AttributeCPUTimeSystem[0]) - initialGCtime := getMetric(exp, "process.runtime.go.gc.cpu.time", attribute.KeyValue{}) - - // Make garabge - for i := 0; i < 2; i++ { - var garbage []struct{} - for start := time.Now(); time.Since(start) < time.Second/16; { - garbage = append(garbage, struct{}{}) - } - garbage = nil - runtime.GC() - - require.NoError(t, exp.Collect(ctx)) - utime := -initialUtime + getMetric(exp, "process.cpu.time", AttributeCPUTimeUser[0]) - stime := -initialStime + getMetric(exp, "process.cpu.time", AttributeCPUTimeSystem[0]) - gctime := -initialGCtime + getMetric(exp, "process.runtime.go.gc.cpu.time", attribute.KeyValue{}) - - require.LessOrEqual(t, gctime, utime+stime) - } -} From b33c8677a1f9867222abc91c807e01c23e89e9c9 Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 21:08:11 -0700 Subject: [PATCH 4/7] chlog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d516f64c..980f3d9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Added -- Proposed replacement for go-contrib instrumentation/host added as lightstep/instrumentation/hostprocess. [#268](https://github.com/lightstep/otel-launcher-go/pull/268) +- Reduced replacement for go-contrib instrumentation/host added as + lightstep/instrumentation/host; same code but removes process metrics + [#268](https://github.com/lightstep/otel-launcher-go/pull/268) ## [1.10.1](https://github.com/lightstep/otel-launcher-go/releases/tag/v1.10.1) - 2022-08-29 From e9d57e9c49e797af43626e7d21f778a8b98f0cef Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 21:32:32 -0700 Subject: [PATCH 5/7] lint --- lightstep/instrumentation/host/host.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lightstep/instrumentation/host/host.go b/lightstep/instrumentation/host/host.go index 5d0070bb..c273495f 100644 --- a/lightstep/instrumentation/host/host.go +++ b/lightstep/instrumentation/host/host.go @@ -18,7 +18,6 @@ import ( "context" "fmt" "sync" - "time" "github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/mem" @@ -34,9 +33,6 @@ import ( "go.opentelemetry.io/otel/metric/unit" ) -// processStartTime should be initialized before the first GC, ideally. -var processStartTime = time.Now() - // Host reports the work-in-progress conventional host metrics specified by OpenTelemetry. type host struct { meter metric.Meter From 44ff1192d0d2a701aa4fd6cd75f5270c90f46fdb Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 21:37:59 -0700 Subject: [PATCH 6/7] unnecessary burn --- lightstep/instrumentation/host/host_test.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lightstep/instrumentation/host/host_test.go b/lightstep/instrumentation/host/host_test.go index 614679fb..f6e36f04 100644 --- a/lightstep/instrumentation/host/host_test.go +++ b/lightstep/instrumentation/host/host_test.go @@ -73,10 +73,6 @@ func TestHostCPU(t *testing.T) { ) assert.NoError(t, err) - // Burn some CPU to be sure we're testing something below. - for start := time.Now(); time.Since(start) < time.Second/2; { - } - // Note: we use a different library // ("github.com/shirou/gopsutil/v3/process") to verify process // CPU times computed from syscall.Getrusage(). From 825101ed8361f2e5dd3b22e1ee9a1bdec6dcf6cb Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 1 Sep 2022 22:02:21 -0700 Subject: [PATCH 7/7] tidy --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 77837835..7293b455 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,12 @@ require ( github.com/lightstep/otel-launcher-go/lightstep/sdk/metric v1.10.1 github.com/lightstep/otel-launcher-go/pipelines v1.10.1 github.com/sethvargo/go-envconfig v0.8.2 + github.com/shirou/gopsutil/v3 v3.22.6 github.com/stretchr/testify v1.8.0 go.opentelemetry.io/otel v1.9.0 go.opentelemetry.io/otel/metric v0.31.0 go.opentelemetry.io/otel/sdk v1.9.0 + go.opentelemetry.io/otel/sdk/metric v0.31.1-0.20220826135333-55b49c407e07 go.opentelemetry.io/otel/trace v1.9.0 ) @@ -26,7 +28,6 @@ require ( github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/shirou/gopsutil/v3 v3.22.6 // indirect github.com/tklauser/go-sysconf v0.3.10 // indirect github.com/tklauser/numcpus v0.4.0 // indirect github.com/yusufpapurcu/wmi v1.2.2 // indirect @@ -39,7 +40,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.31.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.9.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.9.0 // indirect - go.opentelemetry.io/otel/sdk/metric v0.31.1-0.20220826135333-55b49c407e07 // indirect go.opentelemetry.io/proto/otlp v0.18.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.8.0 // indirect