From 358cb6ed48fa8749f20d3815214cdee6d3ac08a0 Mon Sep 17 00:00:00 2001
From: Oleg Zaytsev
Date: Fri, 29 Nov 2024 12:52:56 +0100
Subject: [PATCH] MemPostings: keep a map of label values slices (#15426)

While investigating lock contention on `MemPostings`, we saw that lots
of locking is happening in `LabelValues` and `PostingsForLabelMatching`,
both copying the label values slices while holding the mutex.

This adds an extra map that holds an append-only label values slice for
each one of the label names. Since the slice is append-only, it can be
copied without holding the mutex.

A minimal standalone sketch of this append-only pattern follows the
diff below.

Signed-off-by: Oleg Zaytsev
---
 tsdb/index/postings.go | 169 +++++++++++++++++++++++++----------------
 1 file changed, 104 insertions(+), 65 deletions(-)

diff --git a/tsdb/index/postings.go b/tsdb/index/postings.go
index 8b55954d0..67c8f53bf 100644
--- a/tsdb/index/postings.go
+++ b/tsdb/index/postings.go
@@ -18,6 +18,7 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
+	"maps"
 	"math"
 	"runtime"
 	"slices"
@@ -32,6 +33,8 @@ import (
 	"github.com/prometheus/prometheus/storage"
 )
 
+const exponentialSliceGrowthFactor = 2
+
 var allPostingsKey = labels.Label{}
 
 // AllPostingsKey returns the label key that is used to store the postings list of all existing IDs.
@@ -55,15 +58,33 @@ var ensureOrderBatchPool = sync.Pool{
 // EnsureOrder() must be called once before any reads are done. This allows for quick
 // unordered batch fills on startup.
 type MemPostings struct {
-	mtx     sync.RWMutex
-	m       map[string]map[string][]storage.SeriesRef
+	mtx sync.RWMutex
+
+	// m holds the postings lists for each label-value pair, indexed first by label name, and then by label value.
+	//
+	// mtx must be held when interacting with m (the appropriate one for reading or writing).
+	// It is safe to retain a reference to a postings list after releasing the lock.
+	//
+	// BUG: There's currently a data race in addFor, which might modify the tail of the postings list:
+	// https://github.com/prometheus/prometheus/issues/15317
+	m map[string]map[string][]storage.SeriesRef
+
+	// lvs holds the label values for each label name.
+	// lvs[name] is essentially an unsorted append-only list of all keys in m[name].
+	// mtx must be held when interacting with lvs.
+	// Since it's append-only, it is safe to read the label values slice after releasing the lock.
+	lvs map[string][]string
+
 	ordered bool
 }
 
+const defaultLabelNamesMapSize = 512
+
 // NewMemPostings returns a memPostings that's ready for reads and writes.
 func NewMemPostings() *MemPostings {
 	return &MemPostings{
-		m:       make(map[string]map[string][]storage.SeriesRef, 512),
+		m:       make(map[string]map[string][]storage.SeriesRef, defaultLabelNamesMapSize),
+		lvs:     make(map[string][]string, defaultLabelNamesMapSize),
 		ordered: true,
 	}
 }
@@ -72,7 +93,8 @@ func NewMemPostings() *MemPostings {
 // until EnsureOrder() was called once.
 func NewUnorderedMemPostings() *MemPostings {
 	return &MemPostings{
-		m:       make(map[string]map[string][]storage.SeriesRef, 512),
+		m:       make(map[string]map[string][]storage.SeriesRef, defaultLabelNamesMapSize),
+		lvs:     make(map[string][]string, defaultLabelNamesMapSize),
 		ordered: false,
 	}
 }
@@ -80,16 +102,19 @@ func NewUnorderedMemPostings() *MemPostings {
 // Symbols returns an iterator over all unique name and value strings, in order.
 func (p *MemPostings) Symbols() StringIter {
 	p.mtx.RLock()
+	// Make a quick clone of the map to avoid holding the lock while iterating.
+	// It's safe to use the values of the map after releasing the lock, as they're append-only slices.
+	lvs := maps.Clone(p.lvs)
+	p.mtx.RUnlock()
 
 	// Add all the strings to a map to de-duplicate.
-	symbols := make(map[string]struct{}, 512)
-	for n, e := range p.m {
+	symbols := make(map[string]struct{}, defaultLabelNamesMapSize)
+	for n, labelValues := range lvs {
 		symbols[n] = struct{}{}
-		for v := range e {
+		for _, v := range labelValues {
 			symbols[v] = struct{}{}
 		}
 	}
-	p.mtx.RUnlock()
 
 	res := make([]string, 0, len(symbols))
 	for k := range symbols {
@@ -145,13 +170,14 @@ func (p *MemPostings) LabelNames() []string {
 // LabelValues returns label values for the given name.
 func (p *MemPostings) LabelValues(_ context.Context, name string) []string {
 	p.mtx.RLock()
-	defer p.mtx.RUnlock()
+	values := p.lvs[name]
+	p.mtx.RUnlock()
 
-	values := make([]string, 0, len(p.m[name]))
-	for v := range p.m[name] {
-		values = append(values, v)
-	}
-	return values
+	// The slice from p.lvs[name] is shared between all readers, and it is append-only.
+	// Since it's shared, we need to make a copy of it before returning it to make
+	// sure that no caller modifies the original one by sorting it or filtering it.
+	// Since it's append-only, we can do this while not holding the mutex anymore.
+	return slices.Clone(values)
 }
 
 // PostingsStats contains cardinality based statistics for postings.
@@ -294,6 +320,7 @@ func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected ma
 	p.mtx.Lock()
 	defer p.mtx.Unlock()
 
+	affectedLabelNames := map[string]struct{}{}
 	process := func(l labels.Label) {
 		orig := p.m[l.Name][l.Value]
 		repl := make([]storage.SeriesRef, 0, len(orig))
@@ -306,10 +333,7 @@ func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected ma
 			p.m[l.Name][l.Value] = repl
 		} else {
 			delete(p.m[l.Name], l.Value)
-			// Delete the key if we removed all values.
-			if len(p.m[l.Name]) == 0 {
-				delete(p.m, l.Name)
-			}
+			affectedLabelNames[l.Name] = struct{}{}
 		}
 	}
 
@@ -323,22 +347,52 @@ func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected ma
 		// Note that a read query will most likely want to read multiple postings lists, say 5, 10 or 20 (depending on the number of matchers)
 		// And that read query will most likely evaluate only one of those matchers before we unpause here, so we want to pause often.
 		if i%512 == 0 {
-			p.mtx.Unlock()
-			// While it's tempting to just do a `time.Sleep(time.Millisecond)` here,
-			// it wouldn't ensure use that readers actually were able to get the read lock,
-			// because if there are writes waiting on same mutex, readers won't be able to get it.
-			// So we just grab one RLock ourselves.
-			p.mtx.RLock()
-			// We shouldn't wait here, because we would be blocking a potential write for no reason.
-			// Note that if there's a writer waiting for us to unlock, no reader will be able to get the read lock.
-			p.mtx.RUnlock() //nolint:staticcheck // SA2001: this is an intentionally empty critical section.
-			// Now we can wait a little bit just to increase the chance of a reader getting the lock.
-			// If we were deleting 100M series here, pausing every 512 with 1ms sleeps would be an extra of 200s, which is negligible.
-			time.Sleep(time.Millisecond)
-			p.mtx.Lock()
+			p.unlockWaitAndLockAgain()
 		}
 	}
 	process(allPostingsKey)
+
+	// Now we need to update the label values slices.
+	i = 0
+	for name := range affectedLabelNames {
+		i++
+		// From time to time we want some readers to go through and read their postings.
+		if i%512 == 0 {
+			p.unlockWaitAndLockAgain()
+		}
+
+		if len(p.m[name]) == 0 {
+			// Delete the label name key if we deleted all values.
+			delete(p.m, name)
+			delete(p.lvs, name)
+			continue
+		}
+
+		// Create the new slice with enough room to grow without reallocating.
+		// We have deleted values here, so there's definitely some churn; be prepared for it.
+		lvs := make([]string, 0, exponentialSliceGrowthFactor*len(p.m[name]))
+		for v := range p.m[name] {
+			lvs = append(lvs, v)
+		}
+		p.lvs[name] = lvs
+	}
+}
+
+// unlockWaitAndLockAgain will unlock the already write-locked p.mtx and then wait a little bit before locking it again,
+// letting the RLock()-waiting goroutines get the lock.
+func (p *MemPostings) unlockWaitAndLockAgain() {
+	p.mtx.Unlock()
+	// While it's tempting to just do a `time.Sleep(time.Millisecond)` here,
+	// it wouldn't ensure that readers actually were able to get the read lock,
+	// because if there are writes waiting on the same mutex, readers won't be able to get it.
+	// So we just grab one RLock ourselves.
+	p.mtx.RLock()
+	// We shouldn't wait here, because we would be blocking a potential write for no reason.
+	// Note that if there's a writer waiting for us to unlock, no reader will be able to get the read lock.
+	p.mtx.RUnlock() //nolint:staticcheck // SA2001: this is an intentionally empty critical section.
+	// Now we can wait a little bit just to increase the chance of a reader getting the lock.
+	time.Sleep(time.Millisecond)
+	p.mtx.Lock()
 }
 
 // Iter calls f for each postings list. It aborts if f returns an error and returns it.
@@ -370,7 +424,7 @@ func (p *MemPostings) Add(id storage.SeriesRef, lset labels.Labels) {
 
 func appendWithExponentialGrowth[T any](a []T, v T) []T {
 	if cap(a) < len(a)+1 {
-		newList := make([]T, len(a), len(a)*2+1)
+		newList := make([]T, len(a), len(a)*exponentialSliceGrowthFactor+1)
 		copy(newList, a)
 		a = newList
 	}
@@ -383,7 +437,11 @@ func (p *MemPostings) addFor(id storage.SeriesRef, l labels.Label) {
 		nm = map[string][]storage.SeriesRef{}
 		p.m[l.Name] = nm
 	}
-	list := appendWithExponentialGrowth(nm[l.Value], id)
+	vm, ok := nm[l.Value]
+	if !ok {
+		p.lvs[l.Name] = appendWithExponentialGrowth(p.lvs[l.Name], l.Value)
+	}
+	list := appendWithExponentialGrowth(vm, id)
 	nm[l.Value] = list
 
 	if !p.ordered {
@@ -402,25 +460,27 @@ func (p *MemPostings) addFor(id storage.SeriesRef, l labels.Label) {
 }
 
 func (p *MemPostings) PostingsForLabelMatching(ctx context.Context, name string, match func(string) bool) Postings {
-	// We'll copy the values into a slice and then match over that,
+	// We'll take the label values slice and then match over that,
 	// this way we don't need to hold the mutex while we're matching,
 	// which can be slow (seconds) if the match function is a huge regex.
 	// Holding this lock prevents new series from being added (slows down the write path)
 	// and blocks the compaction process.
-	vals := p.labelValues(name)
-	for i, count := 0, 1; i < len(vals); count++ {
-		if count%checkContextEveryNIterations == 0 && ctx.Err() != nil {
+	//
+	// We just need to make sure we don't modify the slice we took,
+	// so we'll append matching values to a different one.
+	p.mtx.RLock()
+	readOnlyLabelValues := p.lvs[name]
+	p.mtx.RUnlock()
+
+	vals := make([]string, 0, len(readOnlyLabelValues))
+	for i, v := range readOnlyLabelValues {
+		if i%checkContextEveryNIterations == 0 && ctx.Err() != nil {
 			return ErrPostings(ctx.Err())
 		}
-		if match(vals[i]) {
-			i++
-			continue
+
+		if match(v) {
+			vals = append(vals, v)
 		}
-
-		// Didn't match, bring the last value to this position, make the slice shorter and check again.
-		// The order of the slice doesn't matter as it comes from a map iteration.
-		vals[i], vals = vals[len(vals)-1], vals[:len(vals)-1]
 	}
 
 	// If none matched (or this label had no values), no need to grab the lock again.
@@ -465,27 +525,6 @@ func (p *MemPostings) PostingsForAllLabelValues(ctx context.Context, name string
 	return Merge(ctx, its...)
 }
 
-// labelValues returns a slice of label values for the given label name.
-// It will take the read lock.
-func (p *MemPostings) labelValues(name string) []string {
-	p.mtx.RLock()
-	defer p.mtx.RUnlock()
-
-	e := p.m[name]
-	if len(e) == 0 {
-		return nil
-	}
-
-	vals := make([]string, 0, len(e))
-	for v, srs := range e {
-		if len(srs) > 0 {
-			vals = append(vals, v)
-		}
-	}
-
-	return vals
-}
-
 // ExpandPostings returns the postings expanded as a slice.
 func ExpandPostings(p Postings) (res []storage.SeriesRef, err error) {
 	for p.Next() {
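
For illustration only (not part of the patch): a minimal, self-contained Go sketch of the append-only pattern the commit message describes. Writers append each label value at most once while holding the write lock; readers take the slice header under a brief read lock and clone it afterwards. The names appendOnlyIndex, add, and labelValues are hypothetical simplifications; the real MemPostings additionally stores postings lists and rebuilds the values slices on deletion.

package main

import (
	"fmt"
	"slices"
	"sync"
)

// appendOnlyIndex is a hypothetical, trimmed-down analog of MemPostings.
// values[name] is append-only: an append under the write lock either writes
// past the length a reader already captured or allocates a new backing array,
// so a slice header taken under RLock stays a consistent snapshot after unlock.
type appendOnlyIndex struct {
	mtx    sync.RWMutex
	seen   map[string]map[string]struct{} // which (name, value) pairs already exist
	values map[string][]string            // append-only label values per name
}

func newAppendOnlyIndex() *appendOnlyIndex {
	return &appendOnlyIndex{
		seen:   map[string]map[string]struct{}{},
		values: map[string][]string{},
	}
}

// add records a (name, value) pair, appending the value to the append-only
// slice only the first time it is seen.
func (ix *appendOnlyIndex) add(name, value string) {
	ix.mtx.Lock()
	defer ix.mtx.Unlock()
	vs, ok := ix.seen[name]
	if !ok {
		vs = map[string]struct{}{}
		ix.seen[name] = vs
	}
	if _, ok := vs[value]; !ok {
		vs[value] = struct{}{}
		ix.values[name] = append(ix.values[name], value)
	}
}

// labelValues grabs the slice header under a brief read lock and copies it
// outside the lock, so slow callers (sorting, regex matching) never block writers.
func (ix *appendOnlyIndex) labelValues(name string) []string {
	ix.mtx.RLock()
	vals := ix.values[name]
	ix.mtx.RUnlock()
	return slices.Clone(vals)
}

func main() {
	ix := newAppendOnlyIndex()
	ix.add("job", "api")
	ix.add("job", "db")
	ix.add("job", "api") // duplicate value, not appended again
	fmt.Println(ix.labelValues("job")) // [api db]
}

The same trade-off drives the patched LabelValues and PostingsForLabelMatching: the only mutation allowed on the shared slice is an append under the write lock, and any sorting or filtering happens on a private copy made after the lock is released.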