diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
index b39b874bcaac..69bbf27633fd 100644
--- a/docs/generated/settings/settings.html
+++ b/docs/generated/settings/settings.html
@@ -76,7 +76,6 @@
server.goroutine_dump.num_goroutines_threshold | integer | 1000 | a threshold beyond which if number of goroutines increases, then goroutine dump can be triggered |
server.goroutine_dump.total_dump_size_limit | byte size | 500 MiB | total size of goroutine dumps to be kept. Dumps are GC'ed in the order of creation time. The latest dump is always kept even if its size exceeds the limit. |
server.heap_profile.max_profiles | integer | 5 | maximum number of profiles to be kept. Profiles with lower score are GC'ed, but latest profile is always kept. |
-server.heap_profile.system_memory_threshold_fraction | float | 0.85 | fraction of system memory beyond which if Rss increases, then heap profile is triggered |
server.host_based_authentication.configuration | string |
| host-based authentication configuration to use during connection authentication |
server.rangelog.ttl | duration | 720h0m0s | if nonzero, range log entries older than this duration are deleted every 10m0s. Should not be lowered below 24 hours. |
server.remote_debugging.mode | string | local | set to enable remote debugging, localhost-only or disable (any, local, off) |
diff --git a/pkg/base/constants.go b/pkg/base/constants.go
index 4aeb2845f6d2..4a0db40a6fba 100644
--- a/pkg/base/constants.go
+++ b/pkg/base/constants.go
@@ -37,4 +37,8 @@ const (
// ChunkRaftCommandThresholdBytes is the threshold in bytes at which
// to chunk or otherwise limit commands being sent to Raft.
ChunkRaftCommandThresholdBytes = 256 * 1000
+
+	// HeapProfileDir is the name of the directory in which the heap profiler
+	// stores its profiles.
+ HeapProfileDir = "heap_profiler"
)
diff --git a/pkg/cli/start.go b/pkg/cli/start.go
index c359ea49f94d..ee7c9d35addb 100644
--- a/pkg/cli/start.go
+++ b/pkg/cli/start.go
@@ -448,7 +448,9 @@ func runStart(cmd *cobra.Command, args []string) error {
}
serverCfg.GoroutineDumpDirName = logOutputDirectory()
- serverCfg.HeapProfileDirName = logOutputDirectory()
+
+ heapProfileDir := filepath.Join(logOutputDirectory(), base.HeapProfileDir)
+ serverCfg.HeapProfileDirName = heapProfileDir
// We don't care about GRPCs fairly verbose logs in most client commands,
// but when actually starting a server, we enable them.
grpcutil.SetSeverity(log.Severity_WARNING)
diff --git a/pkg/server/config.go b/pkg/server/config.go
index 6315614b62c4..c2fa1396317c 100644
--- a/pkg/server/config.go
+++ b/pkg/server/config.go
@@ -179,8 +179,9 @@ type Config struct {
GoroutineDumpDirName string
// HeapProfileDirName is the directory name for heap profiles using
- // heapprofiler.
+ // heapprofiler. If empty, no heap profiles will be collected.
HeapProfileDirName string
+
// Parsed values.
// NodeAttributes is the parsed representation of Attrs.
diff --git a/pkg/server/heapprofiler/heapprofiler.go b/pkg/server/heapprofiler/heapprofiler.go
index 9302147870eb..0dfc33dd9254 100644
--- a/pkg/server/heapprofiler/heapprofiler.go
+++ b/pkg/server/heapprofiler/heapprofiler.go
@@ -20,6 +20,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
+ "runtime"
"runtime/pprof"
"strings"
"time"
@@ -31,15 +32,11 @@ import (
"github.com/pkg/errors"
)
-const minProfileInterval = time.Minute
+// resetHighWaterMarkInterval specifies how often the high-water mark value will
+// be reset. Immediately after it is reset, a new profile will be taken.
+const resetHighWaterMarkInterval = time.Hour
var (
- systemMemoryThresholdFraction = settings.RegisterFloatSetting(
- "server.heap_profile.system_memory_threshold_fraction",
- "fraction of system memory beyond which if Rss increases, "+
- "then heap profile is triggered",
- .85,
- )
maxProfiles = settings.RegisterIntSetting(
"server.heap_profile.max_profiles",
"maximum number of profiles to be kept. "+
@@ -48,99 +45,80 @@ var (
)
)
-type stats struct {
- rssBytes int64
- systemMemoryBytes int64
- lastProfileTime time.Time
- aboveSysMemThresholdSinceLastProfile bool
- currentTime func() time.Time
+type testingKnobs struct {
+ dontWriteProfiles bool
+ maybeTakeProfileHook func(willTakeProfile bool)
+ now func() time.Time
}
-type heuristic struct {
- name string
- isTrue func(s *stats, st *cluster.Settings) (score int64, isTrue bool)
-}
+// HeapProfiler is used to take heap profiles.
+//
+// MaybeTakeProfile() is supposed to be called periodically. A profile is taken
+// every time the Go heap's allocated bytes exceed the previous high-water
+// mark. The recorded high-water mark is also reset periodically
+// (resetHighWaterMarkInterval), so that a profile is taken at least once per
+// interval.
+// Old profiles are GC'ed: the latest profile is always kept, and beyond that
+// the profiles with the largest heap sizes are kept, up to the
+// server.heap_profile.max_profiles setting.
+type HeapProfiler struct {
+ dir string
+ st *cluster.Settings
+ // lastProfileTime marks the time when we took the last profile.
+ lastProfileTime time.Time
+ // highwaterMarkBytes represents the maximum heap size that we've seen since
+	// resetting the field (which happens periodically).
+ highwaterMarkBytes uint64
-// fractionSystemMemoryHeuristic is true if latest Rss is more than
-// systemMemoryThresholdFraction of system memory. No new profile is
-// taken if Rss has been above threshold since the last time profile was taken,
-// but a new profile will be triggered if Rss has dipped below threshold since
-// the last profile. score is the latest value of Rss.
-// At max one profile will be taken in minProfileInterval.
-var fractionSystemMemoryHeuristic = heuristic{
- name: "fraction_system_memory",
- isTrue: func(s *stats, st *cluster.Settings) (score int64, isTrue bool) {
- currentValue := s.rssBytes
- if float64(currentValue)/float64(s.systemMemoryBytes) > systemMemoryThresholdFraction.Get(&st.SV) {
- if s.currentTime().Sub(s.lastProfileTime) < minProfileInterval ||
- s.aboveSysMemThresholdSinceLastProfile {
- return 0, false
- }
- s.aboveSysMemThresholdSinceLastProfile = true
- return currentValue, true
- }
- s.aboveSysMemThresholdSinceLastProfile = false
- return 0, false
- },
+ knobs testingKnobs
}
-// HeapProfiler is used to take heap profiles if an OOM situation is
-// detected. It stores relevant functions and stats for heuristics to use.
-type HeapProfiler struct {
- *stats
- heuristics []heuristic
- takeHeapProfile func(ctx context.Context, dir string, prefix string, suffix string)
- gcProfiles func(ctx context.Context, dir, prefix string, maxCount int64)
- dir string
+// NewHeapProfiler creates a HeapProfiler. dir is the directory in which
+// profiles are to be stored.
+func NewHeapProfiler(dir string, st *cluster.Settings) (*HeapProfiler, error) {
+ if dir == "" {
+ return nil, errors.Errorf("need to specify dir for NewHeapProfiler")
+ }
+ hp := &HeapProfiler{
+ dir: dir,
+ st: st,
+ }
+ return hp, nil
}
-const memprof = "memprof."
-
-// MaybeTakeProfile takes a heap profile if an OOM situation is detected using
-// heuristics enabled in o. At max one profile is taken in a call of this
-// function. This function is also responsible for updating stats in o.
-func (o *HeapProfiler) MaybeTakeProfile(ctx context.Context, st *cluster.Settings, rssBytes int64) {
- o.rssBytes = rssBytes
- profileTaken := false
- for _, h := range o.heuristics {
- if score, isTrue := h.isTrue(o.stats, st); isTrue {
- if !profileTaken {
- prefix := memprof + h.name + "."
- const format = "2006-01-02T15_04_05.999"
- suffix := fmt.Sprintf("%018d_%s", score, o.currentTime().Format(format))
- o.takeHeapProfile(ctx, o.dir, prefix, suffix)
- o.lastProfileTime = o.currentTime()
- profileTaken = true
- if o.gcProfiles != nil {
- o.gcProfiles(ctx, o.dir, memprof, maxProfiles.Get(&st.SV))
- }
- }
- }
+// MaybeTakeProfile takes a heap profile if the heap's allocated bytes exceed
+// the current high-water mark.
+func (o *HeapProfiler) MaybeTakeProfile(ctx context.Context, ms runtime.MemStats) {
+ // If it's been too long since we took a profile, make sure we'll take one now.
+ if o.now().Sub(o.lastProfileTime) > resetHighWaterMarkInterval {
+ o.highwaterMarkBytes = 0
}
-}
-// NewHeapProfiler returns a HeapProfiler which has
-// systemMemoryThresholdFraction heuristic enabled. dir is the directory in
-// which profiles are stored.
-func NewHeapProfiler(dir string, systemMemoryBytes int64) (*HeapProfiler, error) {
- if dir == "" {
- return nil, errors.New("directory to store profiles could not be determined")
+ curHeap := ms.HeapAlloc
+ takeProfile := curHeap > o.highwaterMarkBytes
+ if hook := o.knobs.maybeTakeProfileHook; hook != nil {
+ hook(takeProfile)
}
- dir = filepath.Join(dir, "heap_profiler")
- if err := os.MkdirAll(dir, 0755); err != nil {
- return nil, err
+ if !takeProfile {
+ return
}
- hp := &HeapProfiler{
- stats: &stats{
- systemMemoryBytes: systemMemoryBytes,
- currentTime: timeutil.Now,
- },
- heuristics: []heuristic{fractionSystemMemoryHeuristic},
- takeHeapProfile: takeHeapProfile,
- gcProfiles: gcProfiles,
- dir: dir,
+
+ o.highwaterMarkBytes = curHeap
+ o.lastProfileTime = o.now()
+
+ if o.knobs.dontWriteProfiles {
+ return
}
- return hp, nil
+ const format = "2006-01-02T15_04_05.999"
+ filePrefix := "memprof."
+ fileName := fmt.Sprintf("%s%018d_%s", filePrefix, curHeap, o.now().Format(format))
+ path := filepath.Join(o.dir, fileName)
+ takeHeapProfile(ctx, path)
+ o.gcProfiles(ctx, o.dir, filePrefix)
+}
+
+func (o *HeapProfiler) now() time.Time {
+ if o.knobs.now != nil {
+ return o.knobs.now()
+ }
+ return timeutil.Now()
}
// gcProfiles removes least score profile matching the specified prefix when the
@@ -148,7 +126,8 @@ func NewHeapProfiler(dir string, systemMemoryBytes int64) (*HeapProfiler, error)
// the profiles indicates score such that sorting the filenames corresponds to
// ordering the profiles from least to max score.
// Latest profile in the directory is not considered for GC.
-func gcProfiles(ctx context.Context, dir, prefix string, maxCount int64) {
+func (o *HeapProfiler) gcProfiles(ctx context.Context, dir, prefix string) {
+ maxCount := maxProfiles.Get(&o.st.SV)
files, err := ioutil.ReadDir(dir)
if err != nil {
log.Warning(ctx, err)
@@ -184,8 +163,7 @@ func gcProfiles(ctx context.Context, dir, prefix string, maxCount int64) {
}
}
-func takeHeapProfile(ctx context.Context, dir string, prefix string, suffix string) {
- path := filepath.Join(dir, prefix+suffix)
+func takeHeapProfile(ctx context.Context, path string) {
// Try writing a go heap profile.
f, err := os.Create(path)
if err != nil {
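For orientation, here is a minimal sketch (not part of the patch) of how the reworked API is meant to be driven, inferred from the signatures above: construct the profiler once with a writable directory and the cluster settings, then feed it a fresh runtime.MemStats on every sampling tick. The directory path and the 10-second tick are arbitrary placeholders, and directory creation is left to the caller, as server.go does below.

package main

import (
	"context"
	"log"
	"runtime"
	"time"

	"github.com/cockroachdb/cockroach/pkg/server/heapprofiler"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
)

func main() {
	ctx := context.Background()
	st := cluster.MakeTestingClusterSettings()

	// The profiler does not create its directory; the caller must (see server.go).
	hp, err := heapprofiler.NewHeapProfiler("/tmp/heap_profiler", st)
	if err != nil {
		log.Fatal(err)
	}

	for range time.Tick(10 * time.Second) {
		var ms runtime.MemStats
		runtime.ReadMemStats(&ms) // stops the world; the server bounds this call (see server.go)
		hp.MaybeTakeProfile(ctx, ms)
	}
}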
diff --git a/pkg/server/heapprofiler/heapprofiler_test.go b/pkg/server/heapprofiler/heapprofiler_test.go
index 6bdbe9e14b41..b6677d3c2abd 100644
--- a/pkg/server/heapprofiler/heapprofiler_test.go
+++ b/pkg/server/heapprofiler/heapprofiler_test.go
@@ -16,84 +16,54 @@ package heapprofiler
import (
"context"
- "strconv"
- "strings"
+ "runtime"
"testing"
"time"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
- "github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/stretchr/testify/assert"
)
-type rssVal struct {
- secs time.Duration // secs is the time at which this rss value was emitted
- rss int64
-}
+func TestHeapProfiler(t *testing.T) {
+ ctx := context.Background()
+ type test struct {
+ secs int // The measurement's timestamp.
+ heapBytes uint64
-func testHelper(
- t *testing.T,
- hp *HeapProfiler,
- st *cluster.Settings,
- rssValues []rssVal,
- expectedScores []int64,
- expectedPrefixes []string,
-) {
- baseTime := time.Time{}
- numProfiles := 0
- mockHeapProfile := func(
- ctx context.Context, dir string, prefix string, suffix string,
- ) {
- assert.Equal(t, prefix, expectedPrefixes[numProfiles])
- score, err := strconv.ParseInt(strings.Split(suffix, "_")[0], 10, 64)
- assert.Nil(t, err)
- assert.Equal(t, expectedScores[numProfiles], score)
- numProfiles++
+ expProfile bool
+ }
+ tests := []test{
+ {0, 30, true}, // we always take the first profile
+ {10, 40, true}, // new high-water mark
+ {20, 30, false}, // below high-water mark; no profile
+ {4000, 10, true}, // new hour; should trigger regardless of the usage
}
var currentTime time.Time
- var timeLocker syncutil.RWMutex
now := func() time.Time {
- timeLocker.RLock()
- defer timeLocker.RUnlock()
return currentTime
}
- hp.takeHeapProfile = mockHeapProfile
- hp.currentTime = now
- // set a large negative time so that first profile is triggered correctly
- // since we start time from 0 in test.
- // Not needed in main code as time will never be 0.
- hp.lastProfileTime = time.Time{}.Add(-1000 * time.Second)
- ctx := context.TODO()
- for _, r := range rssValues {
- currentTime = baseTime.Add(time.Second * r.secs)
- hp.MaybeTakeProfile(ctx, st, r.rss)
+ var tookProfile bool
+ hp, err := NewHeapProfiler("dummy_dir", cluster.MakeTestingClusterSettings())
+ if err != nil {
+ t.Fatal(err)
}
- assert.Equal(t, numProfiles, len(expectedScores))
-}
-
-func TestPercentSystemMemoryHeuristic(t *testing.T) {
- rssValues := []rssVal{
- {0, 30}, {20, 40}, // random small values
- {30, 88}, // should trigger
- {80, 89}, // should not trigger as less than 60s before last profile
- {130, 10}, {140, 4}, // random small values
- {150, 90}, // should trigger
- {260, 92}, // should not trigger as continues above threshold
- {290, 30}, // random small value
- {380, 99}, // should trigger
- {390, 30}, // random small value
- {430, 91}, // should not trigger as less than 60s before last profile
- {500, 95}, // should trigger
+ hp.knobs = testingKnobs{
+ now: now,
+ dontWriteProfiles: true,
+ maybeTakeProfileHook: func(willTakeProfile bool) {
+ tookProfile = willTakeProfile
+ },
}
- expectedScores := []int64{88, 90, 99, 95}
- prefix := "memprof.fraction_system_memory."
- expectedPrefixes := []string{prefix, prefix, prefix, prefix}
- hp := &HeapProfiler{
- stats: &stats{systemMemoryBytes: 100},
- heuristics: []heuristic{fractionSystemMemoryHeuristic},
+
+ for i, r := range tests {
+ currentTime = (time.Time{}).Add(time.Second * time.Duration(r.secs))
+
+ // Initialize enough of ms for the purposes of the HeapProfiler.
+ var ms runtime.MemStats
+ ms.HeapAlloc = r.heapBytes
+
+ hp.MaybeTakeProfile(ctx, ms)
+ assert.Equal(t, r.expProfile, tookProfile, i)
}
- st := &cluster.Settings{}
- systemMemoryThresholdFraction.Override(&st.SV, .85)
- testHelper(t, hp, st, rssValues, expectedScores, expectedPrefixes)
}
diff --git a/pkg/server/server.go b/pkg/server/server.go
index e7caf66b1458..918621a26abb 100644
--- a/pkg/server/server.go
+++ b/pkg/server/server.go
@@ -26,6 +26,7 @@ import (
"net/http"
"os"
"path/filepath"
+ "runtime"
"strings"
"sync"
"sync/atomic"
@@ -1548,7 +1549,7 @@ func (s *Server) Start(ctx context.Context) error {
s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt, s.cfg.AdvertiseAddr, s.cfg.HTTPAdvertiseAddr)
// Begin recording runtime statistics.
- s.startSampleEnvironment(DefaultMetricsSampleInterval)
+ s.startSampleEnvironment(ctx, DefaultMetricsSampleInterval)
// Begin recording time series data collected by the status monitor.
s.tsDB.PollSource(
@@ -1904,68 +1905,92 @@ func (s *Server) Decommission(ctx context.Context, setTo bool, nodeIDs []roachpb
return nil
}
-// startSampleEnvironment begins the heap profiler worker and a worker that
-// periodically instructs the runtime stat sampler to sample the environment.
-func (s *Server) startSampleEnvironment(frequency time.Duration) {
+// startSampleEnvironment starts a worker that periodically instructs the
+// runtime stat sampler to sample the environment and, when enabled, drives the
+// goroutine dumper and the heap profiler.
+func (s *Server) startSampleEnvironment(ctx context.Context, frequency time.Duration) {
// Immediately record summaries once on server startup.
- ctx := s.AnnotateCtx(context.Background())
+ ctx = s.AnnotateCtx(ctx)
goroutineDumper, err := goroutinedumper.NewGoroutineDumper(s.cfg.GoroutineDumpDirName)
if err != nil {
log.Infof(ctx, "Could not start goroutine dumper worker due to: %s", err)
}
- var heapProfiler *heapprofiler.HeapProfiler
- {
- systemMemoryBytes, err := status.GetTotalMemory(ctx)
+ // We're not going to take heap profiles if running with in-memory stores.
+ // This helps some tests that can't write any files.
+ allStoresInMem := true
+ for _, storeSpec := range s.cfg.Stores.Specs {
+ if !storeSpec.InMemory {
+ allStoresInMem = false
+ break
+ }
+ }
+
+ var heapProfiler *heapprofiler.HeapProfiler
+ if s.cfg.HeapProfileDirName != "" && !allStoresInMem {
+ if err := os.MkdirAll(s.cfg.HeapProfileDirName, 0755); err != nil {
+ log.Fatalf(ctx, "Could not create heap profiles dir: %s", err)
+ }
+ heapProfiler, err = heapprofiler.NewHeapProfiler(
+ s.cfg.HeapProfileDirName, s.ClusterSettings())
if err != nil {
- log.Warningf(ctx, "Could not compute system memory due to: %s", err)
- } else {
- heapProfiler, err = heapprofiler.NewHeapProfiler(s.cfg.HeapProfileDirName, systemMemoryBytes)
- if err != nil {
- log.Infof(ctx, "Could not start heap profiler worker due to: %s", err)
- }
+ log.Fatalf(ctx, "Could not start heap profiler worker due to: %s", err)
}
}
- // We run two separate sampling loops, one for memory stats (via
- // ReadMemStats) and one for all other runtime stats. This is necessary
- // because as of go1.11, runtime.ReadMemStats() "stops the world" and
- // requires waiting for any current GC run to finish. With a large heap, a
- // single GC run may take longer than the default sampling period (10s).
s.stopper.RunWorker(ctx, func(ctx context.Context) {
+ var goMemStats atomic.Value // *status.GoMemStats
+ goMemStats.Store(&status.GoMemStats{})
+ var collectingMemStats int32 // atomic, 1 when stats call is ongoing
+
timer := timeutil.NewTimer()
defer timer.Stop()
timer.Reset(frequency)
+
for {
select {
- case <-timer.C:
- timer.Read = true
- s.runtime.SampleMemStats(ctx)
- timer.Reset(frequency)
case <-s.stopper.ShouldStop():
return
- }
- }
- })
-
- s.stopper.RunWorker(ctx, func(ctx context.Context) {
- timer := timeutil.NewTimer()
- defer timer.Stop()
- timer.Reset(frequency)
- for {
- select {
case <-timer.C:
timer.Read = true
- s.runtime.SampleEnvironment(ctx)
+ timer.Reset(frequency)
+
+ // We read the heap stats on another goroutine and give up after 1s.
+ // This is necessary because as of Go 1.12, runtime.ReadMemStats()
+ // "stops the world" and that requires first waiting for any current GC
+ // run to finish. With a large heap and under extreme conditions, a
+ // single GC run may take longer than the default sampling period of
+ // 10s. Under normal operations and with more recent versions of Go,
+ // this hasn't been observed to be a problem.
+ statsCollected := make(chan struct{})
+ if atomic.CompareAndSwapInt32(&collectingMemStats, 0, 1) {
+ if err := s.stopper.RunAsyncTask(ctx, "get-mem-stats", func(ctx context.Context) {
+ var ms status.GoMemStats
+ runtime.ReadMemStats(&ms.MemStats)
+ ms.Collected = timeutil.Now()
+ log.VEventf(ctx, 2, "memstats: %+v", ms)
+
+ goMemStats.Store(&ms)
+ atomic.StoreInt32(&collectingMemStats, 0)
+ close(statsCollected)
+ }); err != nil {
+ close(statsCollected)
+ }
+ }
+
+ select {
+ case <-statsCollected:
+ // Good; we managed to read the Go memory stats quickly enough.
+ case <-time.After(time.Second):
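+						// The read is taking too long (or a previous read is still
+						// running); fall through and use the previously collected,
+						// possibly stale, stats.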
+ }
+
+ curStats := goMemStats.Load().(*status.GoMemStats)
+ s.runtime.SampleEnvironment(ctx, *curStats)
if goroutineDumper != nil {
goroutineDumper.MaybeDump(ctx, s.ClusterSettings(), s.runtime.Goroutines.Value())
}
if heapProfiler != nil {
- heapProfiler.MaybeTakeProfile(ctx, s.ClusterSettings(), s.runtime.RSSBytes.Value())
+ heapProfiler.MaybeTakeProfile(ctx, curStats.MemStats)
}
- timer.Reset(frequency)
- case <-s.stopper.ShouldStop():
- return
+
}
}
})
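The loop above guards runtime.ReadMemStats with an atomic flag, runs it as an async task, and gives it a one-second budget so a GC-blocked read never stalls the sampling cycle. A stripped-down sketch of the same pattern using only the standard library (the real code goes through the stopper and stores a *status.GoMemStats) could look like this:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"time"
)

func main() {
	var latest atomic.Value // holds *runtime.MemStats
	latest.Store(&runtime.MemStats{})
	var collecting int32 // 1 while a ReadMemStats call is in flight

	for range time.Tick(10 * time.Second) {
		done := make(chan struct{})
		if atomic.CompareAndSwapInt32(&collecting, 0, 1) {
			go func() {
				ms := new(runtime.MemStats)
				runtime.ReadMemStats(ms) // may block behind an in-progress GC
				latest.Store(ms)
				atomic.StoreInt32(&collecting, 0)
				close(done)
			}()
		}
		select {
		case <-done:
			// Fresh stats arrived within the budget.
		case <-time.After(time.Second):
			// Too slow (or a previous read is still running); reuse the last value.
		}
		ms := latest.Load().(*runtime.MemStats)
		fmt.Println("Go heap alloc bytes:", ms.HeapAlloc)
	}
}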
diff --git a/pkg/server/status.go b/pkg/server/status.go
index 5dc1863715ce..6cfec7194ec5 100644
--- a/pkg/server/status.go
+++ b/pkg/server/status.go
@@ -95,10 +95,6 @@ const (
// permitted in responses.
omittedKeyStr = "omitted (due to the 'server.remote_debugging.mode' setting)"
- // heapDir is the directory name where the heap profiler stores profiles
- // when there is a potential OOM situation.
- heapDir = "heap_profiler"
-
// goroutineDir is the directory name where the goroutinedumper stores
// goroutine dumps.
goroutinesDir = "goroutine_dump"
@@ -681,7 +677,7 @@ func (s *statusServer) GetFiles(
// TODO(ridwanmsharif): Serve logfiles so debug-zip can fetch them
// instead of reading individual entries.
case serverpb.FileType_HEAP: // Requesting for saved Heap Profiles.
- dir = filepath.Join(s.admin.server.cfg.HeapProfileDirName, heapDir)
+ dir = filepath.Join(s.admin.server.cfg.HeapProfileDirName, base.HeapProfileDir)
case serverpb.FileType_GOROUTINES: // Requesting for saved Goroutine dumps.
dir = filepath.Join(s.admin.server.cfg.GoroutineDumpDirName, goroutinesDir)
default:
diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go
index 35cf2d95e226..ffab35d6ef2a 100644
--- a/pkg/server/status/runtime.go
+++ b/pkg/server/status/runtime.go
@@ -19,15 +19,14 @@ import (
"os"
"runtime"
"runtime/debug"
- "sync/atomic"
"time"
- "unsafe"
"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/metric"
- humanize "github.com/dustin/go-humanize"
+ "github.com/cockroachdb/cockroach/pkg/util/timeutil"
+ "github.com/dustin/go-humanize"
"github.com/elastic/gosigar"
"github.com/shirou/gopsutil/net"
)
@@ -226,12 +225,6 @@ var (
}
)
-type memStats struct {
- goAllocated uint64
- goIdle uint64
- goTotal uint64
-}
-
// getCgoMemStats is a function that fetches stats for the C++ portion of the code.
// We will not necessarily have implementations for all builds, so check for nil first.
// Returns the following:
@@ -261,9 +254,6 @@ type RuntimeStatSampler struct {
net net.IOCountersStat
}
- // Memory stats that are updated atomically by SampleMemStats.
- memStats unsafe.Pointer
-
initialDiskCounters diskStats
initialNetCounters net.IOCountersStat
@@ -387,6 +377,14 @@ func NewRuntimeStatSampler(ctx context.Context, clock *hlc.Clock) *RuntimeStatSa
return rsr
}
+// GoMemStats groups a runtime.MemStats structure with the timestamp when it
+// was collected.
+type GoMemStats struct {
+ runtime.MemStats
+ // Collected is the timestamp at which these values were collected.
+ Collected time.Time
+}
+
// SampleEnvironment queries the runtime system for various interesting metrics,
// storing the resulting values in the set of metric gauges maintained by
// RuntimeStatSampler. This makes runtime statistics more convenient for
@@ -394,7 +392,10 @@ func NewRuntimeStatSampler(ctx context.Context, clock *hlc.Clock) *RuntimeStatSa
//
// This method should be called periodically by a higher level system in order
// to keep runtime statistics current.
-func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context) {
+//
+// SampleEnvironment takes GoMemStats as input because that is collected
+// separately, on a different schedule.
+func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, ms GoMemStats) {
// Note that debug.ReadGCStats() does not suffer the same problem as
// runtime.ReadMemStats(). The only way you can know that is by reading the
// source.
@@ -490,18 +491,20 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context) {
}
}
- ms := (*memStats)(atomic.LoadPointer(&rsr.memStats))
- if ms == nil {
- ms = &memStats{}
- }
-
// Log summary of statistics to console.
cgoRate := float64((numCgoCall-rsr.last.cgoCall)*int64(time.Second)) / dur
- log.Infof(ctx, "runtime stats: %s RSS, %d goroutines, %s/%s/%s GO alloc/idle/total, "+
+ goMemStatsStale := timeutil.Now().Sub(ms.Collected) > time.Second
+ var staleMsg = ""
+ if goMemStatsStale {
+ staleMsg = "(stale)"
+ }
+ goTotal := ms.Sys - ms.HeapReleased
+ log.Infof(ctx, "runtime stats: %s RSS, %d goroutines, %s/%s/%s GO alloc/idle/total%s, "+
"%s/%s CGO alloc/total, %.1f CGO/sec, %.1f/%.1f %%(u/s)time, %.1f %%gc (%dx), "+
"%s/%s (r/w)net",
humanize.IBytes(mem.Resident), numGoroutine,
- humanize.IBytes(ms.goAllocated), humanize.IBytes(ms.goIdle), humanize.IBytes(ms.goTotal),
+ humanize.IBytes(ms.HeapAlloc), humanize.IBytes(ms.HeapIdle), humanize.IBytes(goTotal),
+ staleMsg,
humanize.IBytes(uint64(cgoAllocated)), humanize.IBytes(uint64(cgoTotal)),
cgoRate, 100*uPerc, 100*sPerc, 100*gcPausePercent, gc.NumGC-rsr.last.gcCount,
humanize.IBytes(deltaNet.BytesRecv), humanize.IBytes(deltaNet.BytesSent),
@@ -509,6 +512,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context) {
rsr.last.cgoCall = numCgoCall
rsr.last.gcCount = gc.NumGC
+ rsr.GoAllocBytes.Update(int64(ms.HeapAlloc))
+ rsr.GoTotalBytes.Update(int64(goTotal))
rsr.CgoCalls.Update(numCgoCall)
rsr.Goroutines.Update(int64(numGoroutine))
rsr.CgoAllocBytes.Update(int64(cgoAllocated))
@@ -527,44 +532,6 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context) {
rsr.Uptime.Update((now - rsr.startTimeNanos) / 1e9)
}
-// SampleMemStats queries the runtime system for memory metrics, updating the
-// memory metric gauges and making these metrics available for logging by
-// SampleEnvironment.
-//
-// This method should be called periodically by a higher level system in order
-// to keep runtime statistics current. It is distinct from SampleEnvironment
-// due to a limitation in the Go runtime which causes runtime.ReadMemStats() to
-// block waiting for a GC cycle to finish which can take upwards of 10 seconds
-// on a large heap.
-func (rsr *RuntimeStatSampler) SampleMemStats(ctx context.Context) {
- // Record memory and call stats from the runtime package. It might be useful
- // to call ReadMemStats() more often, but it stops the world while collecting
- // stats so shouldn't be called too often. For a similar reason, we want this
- // call to come first because ReadMemStats() needs to wait for an existing GC
- // to finish which can take multiple seconds on large heaps.
- //
- // NOTE: the MemStats fields do not get decremented when memory is released,
- // to get accurate numbers, be sure to subtract. eg: ms.Sys - ms.HeapReleased
- // for current memory reserved.
- ms := &runtime.MemStats{}
- runtime.ReadMemStats(ms)
-
- goAllocated := ms.Alloc
- goTotal := ms.Sys - ms.HeapReleased
- atomic.StorePointer(&rsr.memStats, unsafe.Pointer(&memStats{
- goAllocated: goAllocated,
- goIdle: ms.HeapIdle - ms.HeapReleased,
- goTotal: goTotal,
- }))
-
- rsr.GoAllocBytes.Update(int64(goAllocated))
- rsr.GoTotalBytes.Update(int64(goTotal))
-
- if log.V(2) {
- log.Infof(ctx, "memstats: %+v", ms)
- }
-}
-
// GetCPUCombinedPercentNorm is part of the distsqlrun.RuntimeStats interface.
func (rsr *RuntimeStatSampler) GetCPUCombinedPercentNorm() float64 {
return rsr.CPUCombinedPercentNorm.Value()
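For reference, the "GO alloc/idle/total" figures in the log line above are read straight from runtime.MemStats; the only derived value is the total, which subtracts HeapReleased from Sys because MemStats fields are not decremented when memory is returned to the OS (as the removed SampleMemStats comment noted). A tiny standalone illustration, with variable names that are ours rather than the sampler's:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)

	goAlloc := ms.HeapAlloc             // bytes currently allocated on the Go heap
	goIdle := ms.HeapIdle               // heap spans with no objects, as logged above
	goTotal := ms.Sys - ms.HeapReleased // memory still reserved from the OS

	fmt.Printf("GO alloc/idle/total: %d/%d/%d bytes\n", goAlloc, goIdle, goTotal)
}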
diff --git a/pkg/server/status_test.go b/pkg/server/status_test.go
index 786c5bfc29d0..c48ad6173415 100644
--- a/pkg/server/status_test.go
+++ b/pkg/server/status_test.go
@@ -20,6 +20,7 @@ import (
"fmt"
"io/ioutil"
"math"
+ "os"
"path/filepath"
"reflect"
"regexp"
@@ -288,7 +289,11 @@ func TestStatusGetFiles(t *testing.T) {
t.Run("heap", func(t *testing.T) {
const testFilesNo = 3
for i := 0; i < testFilesNo; i++ {
- testHeapFile := filepath.Join(storeSpec.Path, "logs", heapDir, fmt.Sprintf("heap%d.pprof", i))
+ testHeapDir := filepath.Join(storeSpec.Path, "logs", base.HeapProfileDir)
+ testHeapFile := filepath.Join(testHeapDir, fmt.Sprintf("heap%d.pprof", i))
+ if err := os.MkdirAll(testHeapDir, os.ModePerm); err != nil {
+ t.Fatal(err)
+ }
if err := ioutil.WriteFile(testHeapFile, []byte(fmt.Sprintf("I'm heap file %d", i)), 0644); err != nil {
t.Fatal(err)
}
diff --git a/pkg/settings/registry.go b/pkg/settings/registry.go
index 4beadedd9452..b7d82249c27a 100644
--- a/pkg/settings/registry.go
+++ b/pkg/settings/registry.go
@@ -44,8 +44,9 @@ var retiredSettings = map[string]struct{}{
// removed as of 2.2.
"kv.raft_log.synchronize": {},
// removed as of 19.2.
- "schemachanger.bulk_index_backfill.enabled": {},
- "rocksdb.ingest_backpressure.delay_l0_file": {}, // never used
+ "schemachanger.bulk_index_backfill.enabled": {},
+ "rocksdb.ingest_backpressure.delay_l0_file": {}, // never used
+ "server.heap_profile.system_memory_threshold_fraction": {},
}
// Register adds a setting to the registry.