Commit

runtime/pprof: introduce hardware performance counters for CPU profiling.

The feature is available on Linux-based systems on the 386, amd64, and arm64
architectures. This change introduces a new API,
'pprof.StartCPUProfileWithConfig(opt ProfilingOption, moreOpts
...ProfilingOption) error', in the runtime/pprof package. A ProfilingOption
can be one of the following:

func OSTimer(w io.Writer) ProfilingOption
func CPUCycles(w io.Writer, period uint64) ProfilingOption
func CPUInstructions(w io.Writer, period uint64) ProfilingOption
func CPUCacheReferences(w io.Writer, period uint64) ProfilingOption
func CPUCacheMisses(w io.Writer, period uint64) ProfilingOption
func CPUBranchInstructions(w io.Writer, period uint64) ProfilingOption
func CPUBranchMisses(w io.Writer, period uint64) ProfilingOption
func CPURawEvent(w io.Writer, period uint64, hex uint64) ProfilingOption

All of them consume an io.Writer. OSTimer accepts no other argument. The
period argument, accepted by every PMU event, specifies the number of events
that must elapse between two consecutive interrupts; a larger value means
lower granularity (the opposite of Hz). Passing a zero period selects a preset
period. CPURawEvent is special: it accepts any user-provided hexadecimal PMU
event code. Each event requires its own io.Writer because different profile
types cannot be serialized into the same pprof protocol buffer.
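
For illustration, a minimal usage sketch based on the API described above (the
output file name and the doWork workload are made up for this example; the
profiling calls themselves are the ones proposed in this change):

package main

import (
	"log"
	"os"
	"runtime/pprof"
)

func main() {
	f, err := os.Create("cycles.pprof")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Sample once every 10,000,000 CPU cycles; a period of 0 would fall
	// back to the preset period.
	if err := pprof.StartCPUProfileWithConfig(pprof.CPUCycles(f, 10000000)); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()

	doWork()
}

// doWork is a placeholder workload for the example.
func doWork() {
	sum := 0
	for i := 0; i < 100000000; i++ {
		sum += i
	}
	_ = sum
}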

Multiple sampling agents can be active at the same time, which is useful for
profiling, say, CPU cache hits vs. misses in a single run. The SIGPROF signal
handler distinguishes whether a sample was delivered by the OS timer or by the
performance monitoring unit (PMU) and takes the appropriate action.
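
A hedged sketch of that multi-event case (the function name, file names, and
work callback are illustrative only; imports of "os" and "runtime/pprof" are
assumed):

// profileCacheBehavior starts two PMU sampling agents in a single run so that
// cache references and cache misses can be compared for the same workload.
// A period of 0 selects the preset period for each event.
func profileCacheBehavior(work func()) error {
	refs, err := os.Create("cache_refs.pprof")
	if err != nil {
		return err
	}
	defer refs.Close()

	misses, err := os.Create("cache_misses.pprof")
	if err != nil {
		return err
	}
	defer misses.Close()

	if err := pprof.StartCPUProfileWithConfig(
		pprof.CPUCacheReferences(refs, 0),
		pprof.CPUCacheMisses(misses, 0),
	); err != nil {
		return err
	}
	defer pprof.StopCPUProfile()

	work()
	return nil
}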

A new runtime API, runtime_pprof_setCPUProfileConfig(eventId int, profConfig
*CPUProfileConfig), exposed only to pprof, is used to set up different PMU
events such as CPUPROF_HW_CPU_CYCLES, CPUPROF_HW_INSTRUCTIONS, etc., along
with their sampling periods.
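
For reference, the pprof side of such a hook follows the same pattern as the
existing readProfile: the runtime defines the function and pushes it into
runtime/pprof via go:linkname, so the pprof package only carries a body-less
declaration. The sketch below illustrates that pattern; the exact parameter
types mirrored on the pprof side are assumptions, not the literal code in this
change.

// Sketch only: body-less declaration on the runtime/pprof side, provided by
// the runtime through
//   //go:linkname runtime_pprof_setCPUProfileConfig runtime/pprof.setCPUProfileConfig
// The eventId values and the cpuProfileConfig layout mirror runtime-internal
// definitions; the types shown here are illustrative.
func setCPUProfileConfig(eventId int32, config *cpuProfileConfig)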

runtime/pprof: add tests to exercise PMU-based profiling.

The tests check whether they are running on a supported platform (Linux on
amd64, 386, or arm64, and not in a VM) and run each test with the PMU cycles
profiling event in addition to OS timer-based CPU profiling.
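
A hedged sketch of the kind of platform guard such tests might use (the helper
name and skip messages are illustrative rather than the actual test code;
imports of "runtime" and "testing" are assumed):

func skipUnlessPMUProfilingSupported(t *testing.T) {
	if runtime.GOOS != "linux" {
		t.Skip("PMU-based CPU profiling is only available on Linux")
	}
	switch runtime.GOARCH {
	case "386", "amd64", "arm64":
		// Supported architectures for this change.
	default:
		t.Skipf("PMU-based CPU profiling is not available on %s", runtime.GOARCH)
	}
	// The tests also skip inside virtual machines; VM detection is
	// platform-specific and omitted from this sketch.
}
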
chabbimilind committed Feb 17, 2021
1 parent dd93d6a commit 334f83e
Showing 24 changed files with 939 additions and 263 deletions.
10 changes: 10 additions & 0 deletions api/go1.txt
@@ -5743,18 +5743,28 @@ pkg runtime, type TypeAssertionError struct
pkg runtime, var MemProfileRate int
pkg runtime/debug, func PrintStack()
pkg runtime/debug, func Stack() []uint8
pkg runtime/pprof, func CPUBranchInstructions(io.Writer, uint64) ProfilingOption
pkg runtime/pprof, func CPUBranchMisses(io.Writer, uint64) ProfilingOption
pkg runtime/pprof, func CPUCacheMisses(io.Writer, uint64) ProfilingOption
pkg runtime/pprof, func CPUCacheReferences(io.Writer, uint64) ProfilingOption
pkg runtime/pprof, func CPUCycles(io.Writer, uint64) ProfilingOption
pkg runtime/pprof, func CPUInstructions(io.Writer, uint64) ProfilingOption
pkg runtime/pprof, func CPURawEvent(io.Writer, uint64, uint64) ProfilingOption
pkg runtime/pprof, func Lookup(string) *Profile
pkg runtime/pprof, func NewProfile(string) *Profile
pkg runtime/pprof, func OSTimer(io.Writer) ProfilingOption
pkg runtime/pprof, func Profiles() []*Profile
pkg runtime/pprof, func StartCPUProfile(io.Writer) error
pkg runtime/pprof, func StopCPUProfile()
pkg runtime/pprof, func WriteHeapProfile(io.Writer) error
pkg runtime/pprof, func StartCPUProfileWithConfig(ProfilingOption, ...ProfilingOption) error
pkg runtime/pprof, method (*Profile) Add(interface{}, int)
pkg runtime/pprof, method (*Profile) Count() int
pkg runtime/pprof, method (*Profile) Name() string
pkg runtime/pprof, method (*Profile) Remove(interface{})
pkg runtime/pprof, method (*Profile) WriteTo(io.Writer, int) error
pkg runtime/pprof, type Profile struct
pkg runtime/pprof, type ProfilingOption interface, unexported methods
pkg sort, func Float64s([]float64)
pkg sort, func Float64sAreSorted([]float64) bool
pkg sort, func Ints([]int)
141 changes: 97 additions & 44 deletions src/runtime/cpuprof.go
@@ -107,7 +107,7 @@ type cpuProfile struct {
lostAtomic uint64 // count of frames lost because of being in atomic64 on mips/arm; updated racily
}

var cpuprof cpuProfile
var cpuprof [_CPUPROF_EVENTS_MAX]cpuProfile

// SetCPUProfileRate sets the CPU profiling rate to hz samples per second.
// If hz <= 0, SetCPUProfileRate turns off profiling.
@@ -125,26 +125,81 @@ func SetCPUProfileRate(hz int) {
hz = 1000000
}

lock(&cpuprof.lock)
if hz > 0 {
if cpuprof.on || cpuprof.log != nil {
print("runtime: cannot set cpu profile rate until previous profile has finished.\n")
unlock(&cpuprof.lock)
var profConfig cpuProfileConfig
profConfig.hz = uint64(hz)
runtime_pprof_setCPUProfileConfig(_CPUPROF_OS_TIMER, &profConfig)
} else {
runtime_pprof_setCPUProfileConfig(_CPUPROF_OS_TIMER, nil)
}
}

func sanitizeCPUProfileConfig(profConfig *cpuProfileConfig) {
if profConfig == nil {
return
}
profConfig.preciseIP = _CPUPROF_IP_ARBITRARY_SKID
profConfig.isSampleIPIncluded = false
profConfig.isSampleThreadIDIncluded = false
profConfig.isSampleAddrIncluded = false
profConfig.isSampleCallchainIncluded = false
profConfig.isKernelIncluded = false
profConfig.isHvIncluded = false
profConfig.isIdleIncluded = false
profConfig.isCallchainKernelIncluded = false
profConfig.isCallchainUserIncluded = false
}

// setCPUProfileConfig, provided to runtime/pprof, enables/disables CPU profiling for a specified CPU event.
// Profiling cannot be enabled if it is already enabled.
// eventId: specifies the event to enable/disable. eventId can be one of the following values:
// _CPUPROF_OS_TIMER, _CPUPROF_HW_CPU_CYCLES, _CPUPROF_HW_INSTRUCTIONS, _CPUPROF_HW_CACHE_REFERENCES,
// _CPUPROF_HW_CACHE_MISSES, CPUPROF_HW_CACHE_LL_READ_ACCESSES, CPUPROF_HW_CACHE_LL_READ_MISSES, _CPUPROF_HW_RAW
// profConfig: provides additional configurations when enabling the specified event.
// A nil profConfig results in disabling the said event.
// TODO: should we make this function return an error?
//
//go:linkname runtime_pprof_setCPUProfileConfig runtime/pprof.setCPUProfileConfig
func runtime_pprof_setCPUProfileConfig(eventId cpuEvent, profConfig *cpuProfileConfig) {
if eventId >= _CPUPROF_EVENTS_MAX {
return
}

lock(&cpuprof[eventId].lock)
defer unlock(&cpuprof[eventId].lock)
if profConfig != nil {
if cpuprof[eventId].on || cpuprof[eventId].log != nil {
print("runtime: cannot set cpu profile config until previous profile has finished.\n")
return
}

cpuprof.on = true
cpuprof.log = newProfBuf(1, 1<<17, 1<<14)
hdr := [1]uint64{uint64(hz)}
cpuprof.log.write(nil, nanotime(), hdr[:], nil)
setcpuprofilerate(int32(hz))
} else if cpuprof.on {
setcpuprofilerate(0)
cpuprof.on = false
cpuprof.addExtra()
cpuprof.log.close()
cpuprof[eventId].on = true
// Enlarging the buffer words and tags reduces the number of samples lost, at the cost of more memory.
cpuprof[eventId].log = newProfBuf(1 /* header size */, 1<<17 /* buffer words */, 1<<14 /* tags */)
// OS timer profiling provides the sampling rate (samples/sec), whereas the other PMU-based events provide
// a sampling interval (aka period), which is the number of events that must elapse before a sample is triggered.
// The latter is called "event-based sampling". In event-based sampling, the overhead is proportional to the
// number of events; no events implies no overhead.
// On Linux-based systems, perf_event_open() allows configuring PMU events in a "Hz" mode; but that is left for later.
if eventId == _CPUPROF_OS_TIMER {
hdr := [1]uint64{profConfig.hz}
cpuprof[eventId].log.write(nil, nanotime(), hdr[:], nil)
} else {
hdr := [1]uint64{profConfig.period}
cpuprof[eventId].log.write(nil, nanotime(), hdr[:], nil)
}
// Take a copy of the profConfig passed by the user, so that the runtime functions are not affected
// if the user code changes the attributes.
cfg := make([]cpuProfileConfig, 1, 1)
cfg[0] = *profConfig
sanitizeCPUProfileConfig(&cfg[0])
setcpuprofileconfig(eventId, &cfg[0])
} else if cpuprof[eventId].on {
setcpuprofileconfig(eventId, nil)
cpuprof[eventId].on = false
cpuprof[eventId].addExtra()
cpuprof[eventId].log.close()
}
unlock(&cpuprof.lock)
}

// add adds the stack trace to the profile.
@@ -153,13 +208,12 @@ func SetCPUProfileRate(hz int) {
// held at the time of the signal, nor can it use substantial amounts
// of stack.
//go:nowritebarrierrec
func (p *cpuProfile) add(gp *g, stk []uintptr) {
// Simple cas-lock to coordinate with setcpuprofilerate.
for !atomic.Cas(&prof.signalLock, 0, 1) {
func (p *cpuProfile) add(gp *g, stk []uintptr, eventId cpuEvent) {
profCfg := &prof[eventId]
for !atomic.Cas(&signalLock, 0, 1) {
osyield()
}

if prof.hz != 0 { // implies cpuprof.log != nil
if profCfg.config != nil { // implies cpuprof[eventId].log != nil
if p.numExtra > 0 || p.lostExtra > 0 || p.lostAtomic > 0 {
p.addExtra()
}
@@ -168,40 +222,38 @@ func (p *cpuProfile) add(gp *g, stk []uintptr) {
// because otherwise its write barrier behavior may not
// be correct. See the long comment there before
// changing the argument here.
cpuprof.log.write(&gp.labels, nanotime(), hdr[:], stk)
cpuprof[eventId].log.write(&gp.labels, nanotime(), hdr[:], stk)
}

atomic.Store(&prof.signalLock, 0)
atomic.Store(&signalLock, 0)
}

// addNonGo adds the non-Go stack trace to the profile.
// It is called from a non-Go thread, so we cannot use much stack at all,
// nor do anything that needs a g or an m.
// In particular, we can't call cpuprof.log.write.
// Instead, we copy the stack into cpuprof.extra,
// In particular, we can't call cpuprof[id].log.write.
// Instead, we copy the stack into cpuprof[id].extra,
// which will be drained the next time a Go thread
// gets the signal handling event.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) addNonGo(stk []uintptr) {
func (p *cpuProfile) addNonGo(stk []uintptr, eventId cpuEvent) {
// Simple cas-lock to coordinate with SetCPUProfileRate.
// (Other calls to add or addNonGo should be blocked out
// by the fact that only one SIGPROF can be handled by the
// process at a time. If not, this lock will serialize those too.)
for !atomic.Cas(&prof.signalLock, 0, 1) {
for !atomic.Cas(&signalLock, 0, 1) {
osyield()
}

if cpuprof.numExtra+1+len(stk) < len(cpuprof.extra) {
i := cpuprof.numExtra
cpuprof.extra[i] = uintptr(1 + len(stk))
copy(cpuprof.extra[i+1:], stk)
cpuprof.numExtra += 1 + len(stk)
prof := &cpuprof[eventId]
if prof.numExtra+1+len(stk) < len(prof.extra) {
i := prof.numExtra
prof.extra[i] = uintptr(1 + len(stk))
copy(prof.extra[i+1:], stk)
prof.numExtra += 1 + len(stk)
} else {
cpuprof.lostExtra++
prof.lostExtra++
}

atomic.Store(&prof.signalLock, 0)
atomic.Store(&signalLock, 0)
}

// addExtra adds the "extra" profiling events,
@@ -266,15 +318,16 @@ func runtime_pprof_runtime_cyclesPerSecond() int64 {
// The caller must save the returned data and tags before calling readProfile again.
//
//go:linkname runtime_pprof_readProfile runtime/pprof.readProfile
func runtime_pprof_readProfile() ([]uint64, []unsafe.Pointer, bool) {
lock(&cpuprof.lock)
log := cpuprof.log
unlock(&cpuprof.lock)
func runtime_pprof_readProfile(eventId cpuEvent) ([]uint64, []unsafe.Pointer, bool) {
prof := &cpuprof[eventId]
lock(&prof.lock)
log := prof.log
unlock(&prof.lock)
data, tags, eof := log.read(profBufBlocking)
if len(data) == 0 && eof {
lock(&cpuprof.lock)
cpuprof.log = nil
unlock(&cpuprof.lock)
lock(&prof.lock)
prof.log = nil
unlock(&prof.lock)
}
return data, tags, eof
}
2 changes: 2 additions & 0 deletions src/runtime/defs_aix_ppc64.go
@@ -89,6 +89,8 @@ const (
_SIG_UNBLOCK = 0x1
_SIG_SETMASK = 0x2

_POLL_IN = 0xdeadbeef // only for compilation

_SA_SIGINFO = 0x100
_SA_RESTART = 0x8
_SA_ONSTACK = 0x1
1 change: 1 addition & 0 deletions src/runtime/os2_freebsd.go
@@ -8,6 +8,7 @@ const (
_SS_DISABLE = 4
_NSIG = 33
_SI_USER = 0x10001
_POLL_IN = 0x1 // taken from https://github.com/freebsd/freebsd/blob/2e1c48e4b2db19ac271c688a4145fd41348f0374/sys/sys/signal.h
_SIG_BLOCK = 1
_SIG_UNBLOCK = 2
_SIG_SETMASK = 3
1 change: 1 addition & 0 deletions src/runtime/os2_openbsd.go
@@ -11,4 +11,5 @@ const (
_SIG_SETMASK = 3
_NSIG = 33
_SI_USER = 0
_POLL_IN = 1
)
1 change: 1 addition & 0 deletions src/runtime/os2_solaris.go
@@ -10,4 +10,5 @@ const (
_SIG_SETMASK = 3
_NSIG = 73 /* number of signals in sigtable array */
_SI_USER = 0
_POLL_IN = 0xdeadbeef // cannot find the code and PMU profiling is not enabled on Solaris.
)
6 changes: 3 additions & 3 deletions src/runtime/os3_plan9.go
@@ -155,12 +155,12 @@ func sigdisable(sig uint32) {
func sigignore(sig uint32) {
}

func setProcessCPUProfiler(hz int32) {
func setProcessCPUProfiler(profConfig *cpuProfileConfig) {
}

func setThreadCPUProfiler(hz int32) {
func setThreadOSTimerProfiler(profConfig *cpuProfileConfig) {
// TODO: Enable profiling interrupts.
getg().m.profilehz = hz
getg().m.profConfig[_CPUPROF_OS_TIMER] = profConfig
}

// gsignalStack is unused on Plan 9.
1 change: 1 addition & 0 deletions src/runtime/os_darwin.go
@@ -338,6 +338,7 @@ func osyield() {
const (
_NSIG = 32
_SI_USER = 0 /* empirically true, but not what headers say */
_POLL_IN = 1 /* obtained from https://github.com/apple/darwin-xnu/blob/master/bsd/sys/signal.h */
_SIG_BLOCK = 1
_SIG_UNBLOCK = 2
_SIG_SETMASK = 3
1 change: 1 addition & 0 deletions src/runtime/os_dragonfly.go
@@ -12,6 +12,7 @@ import (
const (
_NSIG = 33
_SI_USER = 0
_POLL_IN = 1
_SS_DISABLE = 4
_SIG_BLOCK = 1
_SIG_UNBLOCK = 2
10 changes: 5 additions & 5 deletions src/runtime/os_js.go
@@ -121,11 +121,11 @@ func newosproc(mp *m) {
panic("newosproc: not implemented")
}

func setProcessCPUProfiler(hz int32) {}
func setThreadCPUProfiler(hz int32) {}
func sigdisable(uint32) {}
func sigenable(uint32) {}
func sigignore(uint32) {}
func setProcessCPUProfiler(profConfig *cpuProfileConfig) {}
func setThreadOSTimerProfiler(profConfig *cpuProfileConfig) {}
func sigdisable(uint32) {}
func sigenable(uint32) {}
func sigignore(uint32) {}

//go:linkname os_sigpipe os.sigpipe
func os_sigpipe() {