Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: improve reading Go runtime metrics #25886

Merged
merged 30 commits into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
445187c
metrics: WIP use runtime/metrics
fjl Sep 27, 2022
41cc612
metrics: fixup
fjl Sep 27, 2022
993a51b
metrics: use gauge for gc metrics
fjl Sep 27, 2022
516f5a9
metrics: update
fjl Sep 27, 2022
1a4bbce
metrics: update
fjl Sep 27, 2022
7c1a4ac
metrics: improve loop
fjl Sep 27, 2022
41706c8
metrics: track alloc/free in objects not gc
fjl Sep 27, 2022
c0b8c7b
metrics: rename variables
fjl Sep 27, 2022
3705d41
metrics: fixup
fjl Sep 27, 2022
69e9bbf
metrics: track GC alloc/free in bytes
fjl Sep 28, 2022
182856d
metrics: skip invalid/unknown Go runtime metrics
fjl Sep 28, 2022
87ca27d
metrics: explain why skipped
fjl Sep 28, 2022
58194f5
metrics: change GC pauses back to meter
fjl Sep 28, 2022
432760b
metrics: add Float64Histogram adapter
fjl Nov 9, 2022
32a49ce
metrics: fix some issues in runtimeHistogram
fjl Nov 9, 2022
42afa68
metrics: add tests
fjl Nov 9, 2022
e223b03
metrics: handle nil histogram updates
fjl Nov 9, 2022
d5142b0
metrics: add some more runtime metrics
fjl Nov 9, 2022
a8dd515
metrics: improve handling of +Inf value in bucket
fjl Nov 9, 2022
e0712aa
metrics: actually read new runtime metrics
fjl Nov 9, 2022
4419d87
metrics: add scale factor
fjl Nov 9, 2022
0db04a1
metrics: add +Inf workaround in Max
fjl Nov 9, 2022
825acd1
metrics/influxdb: add .25 percentile in histogram output
fjl Nov 9, 2022
3503c11
metrics: remove allocs-bysize
fjl Nov 9, 2022
152df4e
metrics: add more heap metrics
fjl Nov 9, 2022
fdc0ba3
metrics: fix test
fjl Nov 9, 2022
2aed4e3
metrics: fix heapReleased metric name
fjl Nov 9, 2022
2952f6c
metrics: add heap free metric
fjl Nov 9, 2022
924584b
metrics: change memory metrics again
fjl Nov 9, 2022
39aad8b
metrics: remove gc goal
fjl Nov 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions metrics/influxdb/influxdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,27 +160,28 @@ func (r *reporter) send() error {
})
case metrics.Histogram:
ms := metric.Snapshot()

if ms.Count() > 0 {
ps := ms.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999, 0.9999})
ps := ms.Percentiles([]float64{0.25, 0.5, 0.75, 0.95, 0.99, 0.999, 0.9999})
fields := map[string]interface{}{
"count": ms.Count(),
"max": ms.Max(),
"mean": ms.Mean(),
"min": ms.Min(),
"stddev": ms.StdDev(),
"variance": ms.Variance(),
"p25": ps[0],
"p50": ps[1],
"p75": ps[2],
"p95": ps[3],
"p99": ps[4],
"p999": ps[5],
"p9999": ps[6],
}
pts = append(pts, client.Point{
Measurement: fmt.Sprintf("%s%s.histogram", namespace, name),
Tags: r.tags,
Fields: map[string]interface{}{
"count": ms.Count(),
"max": ms.Max(),
"mean": ms.Mean(),
"min": ms.Min(),
"stddev": ms.StdDev(),
"variance": ms.Variance(),
"p50": ps[0],
"p75": ps[1],
"p95": ps[2],
"p99": ps[3],
"p999": ps[4],
"p9999": ps[5],
},
Time: now,
Fields: fields,
Time: now,
})
}
case metrics.Meter:
Expand Down
182 changes: 133 additions & 49 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ package metrics

import (
"os"
"runtime"
"runtime/metrics"
"runtime/pprof"
"strings"
"time"

Expand Down Expand Up @@ -54,73 +55,156 @@ func init() {
}
}

// CollectProcessMetrics periodically collects various metrics about the running
// process.
var threadCreateProfile = pprof.Lookup("threadcreate")

// runtimeStats holds one reading of the Go runtime metrics listed in
// runtimeSamples. readRuntimeStats fills the fields in; a field whose metric
// is reported as metrics.KindBad is left untouched by that call.
type runtimeStats struct {
// Set from "/gc/pauses:seconds".
GCPauses *metrics.Float64Histogram
// Set from "/gc/heap/allocs:bytes".
GCAllocBytes uint64
// Set from "/gc/heap/frees:bytes".
GCFreedBytes uint64
// Set from "/gc/heap/goal:bytes".
GCHeapGoal uint64

// Set from "/memory/classes/total:bytes".
MemTotal uint64
// Set from "/memory/classes/heap/objects:bytes" (a byte size, not an object count).
HeapObjects uint64
// Set from "/memory/classes/heap/free:bytes".
HeapFree uint64
// Set from "/memory/classes/heap/released:bytes".
HeapReleased uint64
// Set from "/memory/classes/heap/unused:bytes".
HeapUnused uint64

// Set from "/sched/goroutines:goroutines".
Goroutines uint64
// Set from "/sched/latencies:seconds".
SchedLatency *metrics.Float64Histogram
}

// runtimeSamples is the list of runtime metrics requested by readRuntimeStats.
// The slice is handed to metrics.Read on every call and then walked by name,
// so each entry here has a matching case in readRuntimeStats.
//
// NOTE(review): the slice is package-level and reused across calls, so
// concurrent readRuntimeStats calls would share these Sample values — confirm
// there is only ever one caller at a time.
var runtimeSamples = []metrics.Sample{
{Name: "/gc/pauses:seconds"}, // histogram
{Name: "/gc/heap/allocs:bytes"},
{Name: "/gc/heap/frees:bytes"},
{Name: "/gc/heap/goal:bytes"},
{Name: "/memory/classes/total:bytes"},
{Name: "/memory/classes/heap/objects:bytes"},
{Name: "/memory/classes/heap/free:bytes"},
{Name: "/memory/classes/heap/released:bytes"},
{Name: "/memory/classes/heap/unused:bytes"},
{Name: "/sched/goroutines:goroutines"},
{Name: "/sched/latencies:seconds"}, // histogram
}

// readRuntimeStats refreshes the shared runtimeSamples via metrics.Read and
// copies every usable sample into the matching field of v.
//
// Samples whose kind is metrics.KindBad are ignored, which leaves the
// corresponding field of v at whatever value it already held.
func readRuntimeStats(v *runtimeStats) {
	metrics.Read(runtimeSamples)

	for i := range runtimeSamples {
		name, val := runtimeSamples[i].Name, runtimeSamples[i].Value

		// Older Go versions do not know every metric in runtimeSamples. Those
		// samples come back as KindBad, and reading a 'bad' value panics, so
		// they have to be filtered out before any accessor is called.
		if val.Kind() == metrics.KindBad {
			continue
		}

		// Histogram-valued metrics are stored right away. For the
		// uint64-valued ones only the destination is selected here and the
		// value is written once after the switch.
		var dst *uint64
		switch name {
		case "/gc/pauses:seconds":
			v.GCPauses = val.Float64Histogram()
			continue
		case "/sched/latencies:seconds":
			v.SchedLatency = val.Float64Histogram()
			continue
		case "/gc/heap/allocs:bytes":
			dst = &v.GCAllocBytes
		case "/gc/heap/frees:bytes":
			dst = &v.GCFreedBytes
		case "/gc/heap/goal:bytes":
			dst = &v.GCHeapGoal
		case "/memory/classes/total:bytes":
			dst = &v.MemTotal
		case "/memory/classes/heap/objects:bytes":
			dst = &v.HeapObjects
		case "/memory/classes/heap/free:bytes":
			dst = &v.HeapFree
		case "/memory/classes/heap/released:bytes":
			dst = &v.HeapReleased
		case "/memory/classes/heap/unused:bytes":
			dst = &v.HeapUnused
		case "/sched/goroutines:goroutines":
			dst = &v.Goroutines
		default:
			// No field in runtimeStats for this name: nothing to record.
			continue
		}
		*dst = val.Uint64()
	}
}

// CollectProcessMetrics periodically collects various metrics about the running process.
//
// It returns immediately when Enabled is false. Otherwise it loops without an
// exit condition, sleeping for refresh after every round, so it does not
// return to its caller.
func CollectProcessMetrics(refresh time.Duration) {
// Short circuit if the metrics system is disabled
if !Enabled {
return
}

// Number of whole seconds per round. The integer conversion truncates, so a
// refresh shorter than one second yields 0 here.
// NOTE(review): the CPU gauge updates below divide by this value — confirm
// callers never pass a sub-second refresh.
refreshFreq := int64(refresh / time.Second)

// Create the various data collectors
cpuStats := make([]*CPUStats, 2)
memstats := make([]*runtime.MemStats, 2)
diskstats := make([]*DiskStats, 2)
for i := 0; i < len(memstats); i++ {
cpuStats[i] = new(CPUStats)
memstats[i] = new(runtime.MemStats)
diskstats[i] = new(DiskStats)
}
// Define the various metrics to collect
var (
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)

memPauses = GetOrRegisterMeter("system/memory/pauses", DefaultRegistry)
memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
memHeld = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
memUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
cpustats = make([]CPUStats, 2)
diskstats = make([]DiskStats, 2)
rstats = make([]runtimeStats, 2)
)

// This scale factor is used for the runtime's time metrics. It's useful to convert to
// ns here because the runtime gives times in float seconds, but runtimeHistogram can
// only provide integers for the minimum and maximum values.
const secondsToNs = float64(time.Second)

// Define the various metrics to collect
var (
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)
cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil)
memPauses = getOrRegisterRuntimeHistogram("system/memory/pauses", secondsToNs, nil)
memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
memTotal = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
heapGCGoal = GetOrRegisterGauge("system/memory/gcgoal", DefaultRegistry)
heapUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
heapObjects = GetOrRegisterGauge("system/memory/objects", DefaultRegistry)
diskReads = GetOrRegisterMeter("system/disk/readcount", DefaultRegistry)
diskReadBytes = GetOrRegisterMeter("system/disk/readdata", DefaultRegistry)
diskReadBytesCounter = GetOrRegisterCounter("system/disk/readbytes", DefaultRegistry)
diskWrites = GetOrRegisterMeter("system/disk/writecount", DefaultRegistry)
diskWriteBytes = GetOrRegisterMeter("system/disk/writedata", DefaultRegistry)
diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
)
// Iterate loading the different stats and updating the meters
for i := 1; ; i++ {
location1 := i % 2
location2 := (i - 1) % 2

ReadCPUStats(cpuStats[location1])
cpuSysLoad.Update((cpuStats[location1].GlobalTime - cpuStats[location2].GlobalTime) / refreshFreq)
cpuSysWait.Update((cpuStats[location1].GlobalWait - cpuStats[location2].GlobalWait) / refreshFreq)
cpuProcLoad.Update((cpuStats[location1].LocalTime - cpuStats[location2].LocalTime) / refreshFreq)

// Iterate loading the different stats and updating the meters.
// now and prev index the two-element stats slices and are swapped after
// every round, so prev always refers to the previous round's reading.
now, prev := 0, 1
for ; ; now, prev = prev, now {
// CPU
ReadCPUStats(&cpustats[now])
cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq)
cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq)
cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq)

// Threads
cpuThreads.Update(int64(threadCreateProfile.Count()))
cpuGoroutines.Update(int64(runtime.NumGoroutine()))

runtime.ReadMemStats(memstats[location1])
memPauses.Mark(int64(memstats[location1].PauseTotalNs - memstats[location2].PauseTotalNs))
memAllocs.Mark(int64(memstats[location1].Mallocs - memstats[location2].Mallocs))
memFrees.Mark(int64(memstats[location1].Frees - memstats[location2].Frees))
memHeld.Update(int64(memstats[location1].HeapSys - memstats[location1].HeapReleased))
memUsed.Update(int64(memstats[location1].Alloc))

if ReadDiskStats(diskstats[location1]) == nil {
diskReads.Mark(diskstats[location1].ReadCount - diskstats[location2].ReadCount)
diskReadBytes.Mark(diskstats[location1].ReadBytes - diskstats[location2].ReadBytes)
diskWrites.Mark(diskstats[location1].WriteCount - diskstats[location2].WriteCount)
diskWriteBytes.Mark(diskstats[location1].WriteBytes - diskstats[location2].WriteBytes)

diskReadBytesCounter.Inc(diskstats[location1].ReadBytes - diskstats[location2].ReadBytes)
diskWriteBytesCounter.Inc(diskstats[location1].WriteBytes - diskstats[location2].WriteBytes)

// Go runtime metrics
readRuntimeStats(&rstats[now])

cpuGoroutines.Update(int64(rstats[now].Goroutines))
cpuSchedLatency.update(rstats[now].SchedLatency)
memPauses.update(rstats[now].GCPauses)

// The alloc/free meters are marked with the difference against the
// previous round's byte totals.
memAllocs.Mark(int64(rstats[now].GCAllocBytes - rstats[prev].GCAllocBytes))
memFrees.Mark(int64(rstats[now].GCFreedBytes - rstats[prev].GCFreedBytes))

memTotal.Update(int64(rstats[now].MemTotal))
// Used memory is the total minus the unused, free and released heap classes.
heapUsed.Update(int64(rstats[now].MemTotal - rstats[now].HeapUnused - rstats[now].HeapFree - rstats[now].HeapReleased))
heapObjects.Update(int64(rstats[now].HeapObjects))
heapGCGoal.Update(int64(rstats[now].GCHeapGoal))

// Disk
// Disk meters and counters are only touched when ReadDiskStats reports no error.
if ReadDiskStats(&diskstats[now]) == nil {
diskReads.Mark(diskstats[now].ReadCount - diskstats[prev].ReadCount)
diskReadBytes.Mark(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
diskWrites.Mark(diskstats[now].WriteCount - diskstats[prev].WriteCount)
diskWriteBytes.Mark(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
diskReadBytesCounter.Inc(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
}

time.Sleep(refresh)
}
}
32 changes: 5 additions & 27 deletions metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,18 @@ package metrics

import (
"fmt"
"io"
"log"
"sync"
"testing"
"time"
)

const FANOUT = 128

// Stop the compiler from complaining during debugging.
var (
_ = io.Discard
_ = log.LstdFlags
)
// TestReadRuntimeValues performs a single readRuntimeStats call into a fresh
// runtimeStats and logs the result with field names.
func TestReadRuntimeValues(t *testing.T) {
	stats := new(runtimeStats)
	readRuntimeStats(stats)
	t.Logf("%+v", *stats)
}

func BenchmarkMetrics(b *testing.B) {
r := NewRegistry()
Expand All @@ -26,7 +24,6 @@ func BenchmarkMetrics(b *testing.B) {
m := NewRegisteredMeter("meter", r)
t := NewRegisteredTimer("timer", r)
RegisterDebugGCStats(r)
RegisterRuntimeMemStats(r)
b.ResetTimer()
ch := make(chan bool)

Expand All @@ -48,24 +45,6 @@ func BenchmarkMetrics(b *testing.B) {
}()
//*/

wgR := &sync.WaitGroup{}
//*
wgR.Add(1)
go func() {
defer wgR.Done()
//log.Println("go CaptureRuntimeMemStats")
for {
select {
case <-ch:
//log.Println("done CaptureRuntimeMemStats")
return
default:
CaptureRuntimeMemStatsOnce(r)
}
}
}()
//*/

wgW := &sync.WaitGroup{}
/*
wgW.Add(1)
Expand Down Expand Up @@ -104,7 +83,6 @@ func BenchmarkMetrics(b *testing.B) {
wg.Wait()
close(ch)
wgD.Wait()
wgR.Wait()
wgW.Wait()
}

Expand Down
Loading