From f97ec5f2c30b5b586107ba1cec817ff9b37af837 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 22 Oct 2020 05:31:17 -0400 Subject: [PATCH 001/106] [dbnode] Reduce index query allocations --- go.mod | 1 + src/cmd/services/m3dbnode/config/cache.go | 35 +++++- src/dbnode/server/server.go | 7 ++ .../storage/index/fields_terms_iterator.go | 17 ++- src/m3ninx/index/regexp.go | 81 +++++++++++++- src/m3ninx/postings/roaring/roaring.go | 8 -- src/m3ninx/postings/types.go | 3 - src/m3ninx/search/executor/iterator.go | 14 ++- src/m3ninx/search/searcher/all.go | 8 +- src/m3ninx/search/searcher/all_test.go | 2 +- src/m3ninx/search/searcher/conjunction.go | 47 ++++---- src/m3ninx/search/searcher/disjunction.go | 29 +++-- src/m3ninx/search/searcher/empty.go | 4 +- src/m3ninx/search/searcher/field.go | 8 +- .../search/searcher/lazy_postings_list.go | 100 ++++++++++++++++++ src/m3ninx/search/searcher/negation.go | 10 +- src/m3ninx/search/searcher/regexp.go | 8 +- src/m3ninx/search/searcher/term.go | 8 +- src/m3ninx/search/types.go | 2 +- 19 files changed, 316 insertions(+), 76 deletions(-) create mode 100644 src/m3ninx/search/searcher/lazy_postings_list.go diff --git a/go.mod b/go.mod index 23e7824ff4..85c0b9108b 100644 --- a/go.mod +++ b/go.mod @@ -42,6 +42,7 @@ require ( github.com/gorilla/handlers v1.4.2 // indirect github.com/gorilla/mux v1.7.3 github.com/gotestyourself/gotestyourself v2.2.0+incompatible // indirect + github.com/hashicorp/golang-lru v0.5.4 github.com/hashicorp/hcl v1.0.1-0.20190611123218-cf7d376da96d // indirect github.com/hydrogen18/stalecucumber v0.0.0-20151102144322-9b38526d4bdf github.com/influxdata/influxdb v1.7.7 diff --git a/src/cmd/services/m3dbnode/config/cache.go b/src/cmd/services/m3dbnode/config/cache.go index cce694fb6e..65f72dcf14 100644 --- a/src/cmd/services/m3dbnode/config/cache.go +++ b/src/cmd/services/m3dbnode/config/cache.go @@ -26,6 +26,7 @@ var ( defaultPostingsListCacheSize = 2 << 11 // 4096 defaultPostingsListCacheRegexp = true defaultPostingsListCacheTerms = true + defaultRegexpCacheSize = 256 ) // CacheConfigurations is the cache configurations. @@ -35,6 +36,9 @@ type CacheConfigurations struct { // PostingsList cache policy. PostingsList *PostingsListCacheConfiguration `yaml:"postingsList"` + + // Regexp cache policy. + Regexp *RegexpCacheConfiguration `yaml:"regexp"` } // SeriesConfiguration returns the series cache configuration or default @@ -53,10 +57,18 @@ func (c CacheConfigurations) PostingsListConfiguration() PostingsListCacheConfig if c.PostingsList == nil { return PostingsListCacheConfiguration{} } - return *c.PostingsList } +// RegexpConfiguration returns the regexp cache configuration or default +// if none is specified. +func (c CacheConfigurations) RegexpConfiguration() RegexpCacheConfiguration { + if c.Regexp == nil { + return RegexpCacheConfiguration{} + } + return *c.Regexp +} + // SeriesCacheConfiguration is the series cache configuration. type SeriesCacheConfiguration struct { Policy series.CachePolicy `yaml:"policy"` @@ -79,7 +91,7 @@ type PostingsListCacheConfiguration struct { // SizeOrDefault returns the provided size or the default value is none is // provided. 
-func (p *PostingsListCacheConfiguration) SizeOrDefault() int {
+func (p PostingsListCacheConfiguration) SizeOrDefault() int {
 	if p.Size == nil {
 		return defaultPostingsListCacheSize
 	}
@@ -89,7 +101,7 @@ func (p *PostingsListCacheConfiguration) SizeOrDefault() int {
 
 // CacheRegexpOrDefault returns the provided cache regexp configuration value
 // or the default value is none is provided.
-func (p *PostingsListCacheConfiguration) CacheRegexpOrDefault() bool {
+func (p PostingsListCacheConfiguration) CacheRegexpOrDefault() bool {
 	if p.CacheRegexp == nil {
 		return defaultPostingsListCacheRegexp
 	}
@@ -99,10 +111,25 @@ func (p *PostingsListCacheConfiguration) CacheRegexpOrDefault() bool {
 
 // CacheTermsOrDefault returns the provided cache terms configuration value
 // or the default value is none is provided.
-func (p *PostingsListCacheConfiguration) CacheTermsOrDefault() bool {
+func (p PostingsListCacheConfiguration) CacheTermsOrDefault() bool {
 	if p.CacheTerms == nil {
 		return defaultPostingsListCacheTerms
 	}
 
 	return *p.CacheTerms
 }
+
+// RegexpCacheConfiguration is the configuration for the compiled regexp
+// cache used for query regexps.
+type RegexpCacheConfiguration struct {
+	Size *int `yaml:"size"`
+}
+
+// SizeOrDefault returns the provided size or the default value if none is
+// provided.
+func (c RegexpCacheConfiguration) SizeOrDefault() int {
+	if c.Size == nil {
+		return defaultRegexpCacheSize
+	}
+
+	return *c.Size
+}
diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go
index 5d4f6b8684..493018719a 100644
--- a/src/dbnode/server/server.go
+++ b/src/dbnode/server/server.go
@@ -73,6 +73,7 @@ import (
 	xtchannel "github.com/m3db/m3/src/dbnode/x/tchannel"
 	"github.com/m3db/m3/src/dbnode/x/xio"
 	"github.com/m3db/m3/src/dbnode/x/xpool"
+	m3ninxindex "github.com/m3db/m3/src/m3ninx/index"
 	"github.com/m3db/m3/src/m3ninx/postings"
 	"github.com/m3db/m3/src/m3ninx/postings/roaring"
 	"github.com/m3db/m3/src/query/api/v1/handler/placement"
@@ -413,6 +414,12 @@ func Run(runOpts RunOptions) {
 	}
 	defer stopReporting()
 
+	// Setup regexp cache.
+	regexpCacheSize := cfg.Cache.RegexpConfiguration().SizeOrDefault()
+	if err := m3ninxindex.SetRegexpCacheSize(regexpCacheSize); err != nil {
+		logger.Fatal("could not set regexp cache size", zap.Error(err))
+	}
+
 	// Setup query stats tracking.
 	docsLimit := limits.DefaultLookbackLimitOptions()
 	bytesReadLimit := limits.DefaultLookbackLimitOptions()
diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go
index 186c32d47b..94bc5cac33 100644
--- a/src/dbnode/storage/index/fields_terms_iterator.go
+++ b/src/dbnode/storage/index/fields_terms_iterator.go
@@ -22,6 +22,7 @@ package index
 
 import (
 	"errors"
+	"fmt"
 
 	"github.com/m3db/m3/src/m3ninx/index/segment"
 	"github.com/m3db/m3/src/m3ninx/postings"
@@ -122,11 +123,23 @@ func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsI
 			return err
 		}
 
-		pl, err := searcher.Search(fti.reader)
+		pl, iter, err := searcher.Search(fti.reader)
 		if err != nil {
 			return err
 		}
 
+		if pl == nil && iter != nil {
+			// Turn iterator into a postings list.
+			mutable := roaring.NewPostingsList()
+			if err := mutable.AddIterator(iter); err != nil {
+				return err
+			}
+			pl = mutable
+		}
+		if pl == nil {
+			return fmt.Errorf("no postings list or iterator returned")
+		}
+
		// Hold onto the postings bitmap to intersect against on a per term basis.
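 		// (The restriction postings are unpacked into a bitmap so that each
 		// term's postings can be cheaply intersection-counted against it in
 		// nextTermsIterResult below, rather than materializing a full
 		// intersection per term.)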
		bitmap, ok := roaring.BitmapFromPostingsList(pl)
 		if !ok {
@@ -213,7 +226,7 @@ func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) {
 		return false, errUnpackBitmapFromPostingsList
 	}
 
-	// Check term isn part of at least some of the documents we're
+	// Check term isn't part of at least some of the documents we're
 	// restricted to providing results for based on intersection
 	// count.
 	// Note: IntersectionCount is significantly faster than intersecting and
diff --git a/src/m3ninx/index/regexp.go b/src/m3ninx/index/regexp.go
index 2917422681..678546ccbe 100644
--- a/src/m3ninx/index/regexp.go
+++ b/src/m3ninx/index/regexp.go
@@ -24,8 +24,11 @@ import (
 	"fmt"
 	re "regexp"
 	"regexp/syntax"
+	"sync"
 
 	fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp"
+
+	"github.com/hashicorp/golang-lru/simplelru"
 )
 
 var (
@@ -42,6 +45,53 @@ func init() {
 	dotStarCompiledRegex = re
 }
 
+var (
+	// cache for regexes, as per Go std lib:
+	// A Regexp is safe for concurrent use by multiple goroutines, except for
+	// configuration methods, such as Longest.
+	// The vellum Regexp is also safe for concurrent use as it is queried for
+	// states but does not mutate internal state.
+	cacheLock sync.RWMutex
+	cache     *simplelru.LRU
+	cacheSize int
+)
+
+// RegexpCacheSize returns the regex cache size.
+func RegexpCacheSize() int {
+	cacheLock.RLock()
+	n := cacheSize
+	cacheLock.RUnlock()
+	return n
+}
+
+// SetRegexpCacheSize sets the regex cache size; a size of zero disables the
+// cache.
+func SetRegexpCacheSize(size int) error {
+	cacheLock.Lock()
+	defer cacheLock.Unlock()
+
+	if size < 0 {
+		return fmt.Errorf("expected zero or greater size: actual=%d", size)
+	}
+
+	// Track the configured size so RegexpCacheSize can report it.
+	cacheSize = size
+
+	if size == 0 {
+		cache = nil
+		return nil
+	}
+
+	if cache != nil {
+		cache.Resize(size)
+		return nil
+	}
+
+	v, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return err
+	}
+
+	cache = v
+	return nil
+}
+
 // DotStarCompiledRegex returns a regexp which matches ".*".
 func DotStarCompiledRegex() CompiledRegex {
 	return dotStarCompiledRegex
@@ -54,8 +104,29 @@ func CompileRegex(r []byte) (CompiledRegex, error) {
 	// Due to peculiarities in the implementation of Vellum, we have to make certain modifications
 	// to all incoming regular expressions to ensure compatibility between them.
 
-	// first, we parse the regular expression into the equivalent regex
 	reString := string(r)
+
+	// Check cache first.
+	var (
+		cachedValue       CompiledRegex
+		cachedValueExists bool
+		cacheEnabled      bool
+	)
+	cacheLock.RLock()
+	cacheEnabled = cache != nil
+	if cacheEnabled {
+		var cached interface{}
+		cached, cachedValueExists = cache.Get(reString)
+		if cachedValueExists {
+			cachedValue = cached.(CompiledRegex)
+		}
+	}
+	cacheLock.RUnlock()
+	if cachedValueExists {
+		return cachedValue, nil
+	}
+
+	// first, we parse the regular expression into the equivalent regex
 	reAst, err := parseRegexp(reString)
 	if err != nil {
 		return CompiledRegex{}, err
@@ -94,6 +165,14 @@ func CompileRegex(r []byte) (CompiledRegex, error) {
 	compiledRegex.PrefixBegin = start
 	compiledRegex.PrefixEnd = end
 
+	// Update cache if cache existed when we checked.
+	if cacheEnabled {
+		cacheLock.Lock()
+		// Update the entry now the write lock is held. NB: assumes the cache
+		// was not concurrently disabled since the read-locked check above.
+ cache.Add(reString, compiledRegex) + cacheLock.Unlock() + } + return compiledRegex, nil } diff --git a/src/m3ninx/postings/roaring/roaring.go b/src/m3ninx/postings/roaring/roaring.go index e1f8a3d820..972dcf684d 100644 --- a/src/m3ninx/postings/roaring/roaring.go +++ b/src/m3ninx/postings/roaring/roaring.go @@ -179,14 +179,6 @@ func (d *postingsList) IsEmpty() bool { return d.bitmap.Count() == 0 } -func (d *postingsList) Max() (postings.ID, error) { - if d.IsEmpty() { - return 0, postings.ErrEmptyList - } - max := d.bitmap.Max() - return postings.ID(max), nil -} - func (d *postingsList) Len() int { return int(d.bitmap.Count()) } diff --git a/src/m3ninx/postings/types.go b/src/m3ninx/postings/types.go index dc4f13907f..cc651bdde9 100644 --- a/src/m3ninx/postings/types.go +++ b/src/m3ninx/postings/types.go @@ -52,9 +52,6 @@ type List interface { // calculating the size of the postings list. IsEmpty() bool - // Max returns the maximum ID in the postings list or an error if it is empty. - Max() (ID, error) - // Len returns the numbers of IDs in the postings list. Len() int diff --git a/src/m3ninx/search/executor/iterator.go b/src/m3ninx/search/executor/iterator.go index a95e95bb75..8289b28c5c 100644 --- a/src/m3ninx/search/executor/iterator.go +++ b/src/m3ninx/search/executor/iterator.go @@ -21,6 +21,8 @@ package executor import ( + "fmt" + "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/search" @@ -117,15 +119,17 @@ func (it *iterator) nextIter() (doc.Iterator, bool, error) { } reader := it.readers[it.idx] - pl, err := it.searcher.Search(reader) + pl, iter, err := it.searcher.Search(reader) if err != nil { return nil, false, err } - iter, err := reader.Docs(pl) - if err != nil { - return nil, false, err + if pl != nil && iter == nil { + iter = pl.Iterator() + } + if iter == nil { + return nil, false, fmt.Errorf("no postings list or iterator returned") } - return iter, true, nil + return index.NewIDDocIterator(reader, iter), true, nil } diff --git a/src/m3ninx/search/searcher/all.go b/src/m3ninx/search/searcher/all.go index 43b1b9d128..a670f868f1 100644 --- a/src/m3ninx/search/searcher/all.go +++ b/src/m3ninx/search/searcher/all.go @@ -33,6 +33,10 @@ func NewAllSearcher() search.Searcher { return &all{} } -func (s *all) Search(r index.Reader) (postings.List, error) { - return r.MatchAll() +func (s *all) Search(r index.Reader) (postings.List, postings.Iterator, error) { + pl, err := r.MatchAll() + if err != nil { + return nil, nil, err + } + return pl, nil, nil } diff --git a/src/m3ninx/search/searcher/all_test.go b/src/m3ninx/search/searcher/all_test.go index c4cedbaaac..18e6f48cf6 100644 --- a/src/m3ninx/search/searcher/all_test.go +++ b/src/m3ninx/search/searcher/all_test.go @@ -39,7 +39,7 @@ func TestAllSearcher(t *testing.T) { allPl := roaring.NewPostingsList() reader.EXPECT().MatchAll().Return(allPl, nil) - pl, err := s.Search(reader) + pl, _, err := s.Search(reader) require.NoError(t, err) require.True(t, pl.Equal(allPl)) } diff --git a/src/m3ninx/search/searcher/conjunction.go b/src/m3ninx/search/searcher/conjunction.go index 1473bc8271..c0dda0a42e 100644 --- a/src/m3ninx/search/searcher/conjunction.go +++ b/src/m3ninx/search/searcher/conjunction.go @@ -21,6 +21,8 @@ package searcher import ( + "fmt" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/search" @@ -44,45 +46,38 @@ func NewConjunctionSearcher(searchers, negations search.Searchers) (search.Searc }, nil } 
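For orientation across these searcher changes: Search now returns either a
materialized postings list or a lazy postings iterator, never both. A minimal
sketch of how a caller can normalize the two shapes, mirroring the handling in
fields_terms_iterator.go above (normalizePostings is a hypothetical helper,
not part of this patch):

func normalizePostings(pl postings.List, iter postings.Iterator) (postings.List, error) {
	if pl != nil {
		return pl, nil
	}
	if iter == nil {
		return nil, errors.New("no postings list or iterator returned")
	}
	// Materialize the lazy iterator into a mutable roaring postings list.
	mutable := roaring.NewPostingsList()
	if err := mutable.AddIterator(iter); err != nil {
		return nil, err
	}
	return mutable, nil
}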
-func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, error) { - var pl postings.MutableList +func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { + var ( + intersects = make([]postings.List, 0, len(s.searchers)) + negations = make([]postings.List, 0, len(s.negations)) + ) for _, sr := range s.searchers { - curr, err := sr.Search(r) + pl, _, err := sr.Search(r) if err != nil { - return nil, err + return nil, nil, err } - - // TODO: Sort the iterators so that we take the intersection in order of increasing size. if pl == nil { - pl = curr.Clone() - } else { - if err := pl.Intersect(curr); err != nil { - return nil, err - } + return nil, nil, fmt.Errorf("conjunction searchers must resolve postings lists") } - // We can break early if the interescted postings list is ever empty. - if pl.IsEmpty() { - break - } + intersects = append(intersects, pl) } for _, sr := range s.negations { - curr, err := sr.Search(r) + pl, _, err := sr.Search(r) if err != nil { - return nil, err + return nil, nil, err } - - // TODO: Sort the iterators so that we take the set differences in order of decreasing size. - if err := pl.Difference(curr); err != nil { - return nil, err + if pl == nil { + return nil, nil, fmt.Errorf("conjunction searchers must resolve postings lists") } - // We can break early if the interescted postings list is ever empty. - if pl.IsEmpty() { - break - } + negations = append(negations, pl) } - return pl, nil + iter, err := newIntersectAndNegatePostingsListIter(intersects, negations) + if err != nil { + return nil, nil, err + } + return nil, iter, nil } diff --git a/src/m3ninx/search/searcher/disjunction.go b/src/m3ninx/search/searcher/disjunction.go index 0cb77e7ec6..264c951744 100644 --- a/src/m3ninx/search/searcher/disjunction.go +++ b/src/m3ninx/search/searcher/disjunction.go @@ -21,8 +21,11 @@ package searcher import ( + "fmt" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/search" ) @@ -42,20 +45,26 @@ func NewDisjunctionSearcher(searchers search.Searchers) (search.Searcher, error) }, nil } -func (s *disjunctionSearcher) Search(r index.Reader) (postings.List, error) { - var pl postings.MutableList +func (s *disjunctionSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { + var ( + union = make([]postings.List, 0, len(s.searchers)) + ) for _, sr := range s.searchers { - curr, err := sr.Search(r) + pl, _, err := sr.Search(r) if err != nil { - return nil, err + return nil, nil, err } - - // TODO: Sort the iterators so that we take the union in order of decreasing size. 
if pl == nil { - pl = curr.Clone() - } else { - pl.Union(curr) + return nil, nil, fmt.Errorf("disjunction searchers must resolve postings lists") } + + union = append(union, pl) } - return pl, nil + + pl, err := roaring.Union(union) + if err != nil { + return nil, nil, err + } + + return pl, nil, nil } diff --git a/src/m3ninx/search/searcher/empty.go b/src/m3ninx/search/searcher/empty.go index e2298c012b..e6ffb67e6f 100644 --- a/src/m3ninx/search/searcher/empty.go +++ b/src/m3ninx/search/searcher/empty.go @@ -38,6 +38,6 @@ func NewEmptySearcher() search.Searcher { } } -func (s *emptySearcher) Search(r index.Reader) (postings.List, error) { - return s.postings, nil +func (s *emptySearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { + return s.postings, nil, nil } diff --git a/src/m3ninx/search/searcher/field.go b/src/m3ninx/search/searcher/field.go index 20446fcf2d..3bbe0343d8 100644 --- a/src/m3ninx/search/searcher/field.go +++ b/src/m3ninx/search/searcher/field.go @@ -37,6 +37,10 @@ func NewFieldSearcher(field []byte) (search.Searcher, error) { }, nil } -func (s *fieldSearcher) Search(r index.Reader) (postings.List, error) { - return r.MatchField(s.field) +func (s *fieldSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { + pl, err := r.MatchField(s.field) + if err != nil { + return nil, nil, err + } + return pl, nil, nil } diff --git a/src/m3ninx/search/searcher/lazy_postings_list.go b/src/m3ninx/search/searcher/lazy_postings_list.go new file mode 100644 index 0000000000..ca22ab3778 --- /dev/null +++ b/src/m3ninx/search/searcher/lazy_postings_list.go @@ -0,0 +1,100 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
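+
+// Design note: the intersect-and-negate iterator in this file walks only the
+// smallest intersected postings list and probes each candidate against the
+// remaining intersect and negation lists with Contains, so no intermediate
+// bitmaps are materialized; the cost is at most one Contains probe per other
+// list for each candidate.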
+
+package searcher
+
+import (
+	"errors"
+	"sort"
+
+	"github.com/m3db/m3/src/m3ninx/postings"
+)
+
+var (
+	errNoPostingsLists = errors.New("no postings lists")
+)
+
+var _ postings.Iterator = (*intersectAndNegatePostingsListIter)(nil)
+
+type intersectAndNegatePostingsListIter struct {
+	smallestIntersectIter    postings.Iterator
+	nonSmallestIntersectsAsc []postings.List
+	negationsDesc            []postings.List
+	current                  postings.ID
+}
+
+func newIntersectAndNegatePostingsListIter(
+	intersects []postings.List,
+	negations []postings.List,
+) (postings.Iterator, error) {
+	if len(intersects) == 0 {
+		return nil, errNoPostingsLists
+	}
+
+	// Sort the intersected lists ascending by length so that the smallest
+	// list drives iteration and each of its candidates is probed against the
+	// larger lists with Contains; sort the negations descending by length so
+	// the negation most likely to exclude a candidate is checked first.
+	sort.Slice(intersects, func(i, j int) bool {
+		return intersects[i].Len() < intersects[j].Len()
+	})
+	sort.Slice(negations, func(i, j int) bool {
+		return negations[i].Len() > negations[j].Len()
+	})
+	return &intersectAndNegatePostingsListIter{
+		smallestIntersectIter:    intersects[0].Iterator(),
+		nonSmallestIntersectsAsc: intersects[1:],
+		negationsDesc:            negations,
+		current:                  postings.MaxID,
+	}, nil
+}
+
+func (it *intersectAndNegatePostingsListIter) Current() postings.ID {
+	return it.current
+}
+
+func (it *intersectAndNegatePostingsListIter) Next() bool {
+NextValue:
+	for {
+		if !it.smallestIntersectIter.Next() {
+			return false
+		}
+		curr := it.smallestIntersectIter.Current()
+		for _, list := range it.nonSmallestIntersectsAsc {
+			if !list.Contains(curr) {
+				continue NextValue
+			}
+		}
+		for _, list := range it.negationsDesc {
+			if list.Contains(curr) {
+				continue NextValue
+			}
+		}
+		it.current = curr
+		return true
+	}
+}
+
+func (it *intersectAndNegatePostingsListIter) Err() error {
+	return nil
+}
+
+func (it *intersectAndNegatePostingsListIter) Close() error {
+	return nil
+}
diff --git a/src/m3ninx/search/searcher/negation.go b/src/m3ninx/search/searcher/negation.go
index 5df70c4e8b..328d7f6dc5 100644
--- a/src/m3ninx/search/searcher/negation.go
+++ b/src/m3ninx/search/searcher/negation.go
@@ -38,17 +38,17 @@ func NewNegationSearcher(s search.Searcher) (search.Searcher, error) {
 	}, nil
 }
 
-func (s *negationSearcher) Search(r index.Reader) (postings.List, error) {
+func (s *negationSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) {
 	pl, err := r.MatchAll()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
-	sPl, err := s.searcher.Search(r)
+	sPl, _, err := s.searcher.Search(r)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	pl.Difference(sPl)
 
-	return pl, nil
+	return pl, nil, nil
 }
diff --git a/src/m3ninx/search/searcher/regexp.go b/src/m3ninx/search/searcher/regexp.go
index f4c1688d95..8141f2597b 100644
--- a/src/m3ninx/search/searcher/regexp.go
+++ b/src/m3ninx/search/searcher/regexp.go
@@ -40,6 +40,10 @@ func NewRegexpSearcher(field []byte, compiled index.CompiledRegex) search.Search
 	}
 }
 
-func (s *regexpSearcher) Search(r index.Reader) (postings.List, error) {
-	return r.MatchRegexp(s.field, s.compiled)
+func (s *regexpSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) {
+	pl, err := r.MatchRegexp(s.field, s.compiled)
+	if err != nil {
+		return nil, nil, err
+	}
+	return pl, nil, nil
 }
diff --git a/src/m3ninx/search/searcher/term.go b/src/m3ninx/search/searcher/term.go
index b550e1211f..f9e4da1dbb 100644
--- a/src/m3ninx/search/searcher/term.go
+++ b/src/m3ninx/search/searcher/term.go
@@ -38,6 +38,10 @@ func NewTermSearcher(field, term []byte) 
search.Searcher { } } -func (s *termSearcher) Search(r index.Reader) (postings.List, error) { - return r.MatchTerm(s.field, s.term) +func (s *termSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { + pl, err := r.MatchTerm(s.field, s.term) + if err != nil { + return nil, nil, err + } + return pl, nil, nil } diff --git a/src/m3ninx/search/types.go b/src/m3ninx/search/types.go index d4fc76b4ab..cf233a25ae 100644 --- a/src/m3ninx/search/types.go +++ b/src/m3ninx/search/types.go @@ -56,7 +56,7 @@ type Query interface { // of the documents it matches for the given segment. type Searcher interface { // Search executes a configured query against the given Reader. - Search(index.Reader) (postings.List, error) + Search(index.Reader) (postings.List, postings.Iterator, error) } // Searchers is a slice of Searcher. From 17b73980004fe5061be4efd29d5d4eec9d76cfe8 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 23 Oct 2020 17:26:22 -0400 Subject: [PATCH 002/106] Use new LRU cache and instrument --- .../m3coordinator/ingest/carbon/ingest.go | 49 +- .../services/m3coordinator/ingest/metrics.go | 90 +++ .../m3coordinator/ingest/metrics_test.go | 49 ++ src/dbnode/server/server.go | 10 +- src/m3ninx/index/regexp.go | 117 ++-- src/m3ninx/index/regexp_test.go | 20 + .../api/v1/handler/prometheus/remote/write.go | 53 +- src/x/cache/cache.go | 55 ++ src/x/cache/lru_cache.go | 521 ++++++++++++++ src/x/cache/lru_cache_test.go | 638 ++++++++++++++++++ src/x/cache/nop_cache.go | 47 ++ src/x/tallytest/tallytest.go | 114 ++++ 12 files changed, 1640 insertions(+), 123 deletions(-) create mode 100644 src/cmd/services/m3coordinator/ingest/metrics.go create mode 100644 src/cmd/services/m3coordinator/ingest/metrics_test.go create mode 100644 src/x/cache/cache.go create mode 100644 src/x/cache/lru_cache.go create mode 100644 src/x/cache/lru_cache_test.go create mode 100644 src/x/cache/nop_cache.go create mode 100644 src/x/tallytest/tallytest.go diff --git a/src/cmd/services/m3coordinator/ingest/carbon/ingest.go b/src/cmd/services/m3coordinator/ingest/carbon/ingest.go index 26261b3189..0061c44ea1 100644 --- a/src/cmd/services/m3coordinator/ingest/carbon/ingest.go +++ b/src/cmd/services/m3coordinator/ingest/carbon/ingest.go @@ -127,17 +127,20 @@ func NewIngester( } }) + scope := opts.InstrumentOptions.MetricsScope() + metrics, err := newCarbonIngesterMetrics(scope) + if err != nil { + return nil, err + } + return &ingester{ downsamplerAndWriter: downsamplerAndWriter, opts: opts, logger: opts.InstrumentOptions.Logger(), tagOpts: tagOpts, - metrics: newCarbonIngesterMetrics( - opts.InstrumentOptions.MetricsScope()), - - rules: compiledRules, - - lineResourcesPool: resourcePool, + metrics: metrics, + rules: compiledRules, + lineResourcesPool: resourcePool, }, nil } @@ -283,10 +286,8 @@ func (i *ingester) writeWithOptions( return err } - err = i.downsamplerAndWriter.Write( - ctx, tags, resources.datapoints, xtime.Second, nil, opts, - ) - + err = i.downsamplerAndWriter.Write(ctx, tags, resources.datapoints, + xtime.Second, nil, opts) if err != nil { i.logger.Error("err writing carbon metric", zap.String("name", string(resources.name)), zap.Error(err)) @@ -301,18 +302,26 @@ func (i *ingester) Close() { // We don't maintain any state in-between connections so there is nothing to do here. 
 }
 
-func newCarbonIngesterMetrics(m tally.Scope) carbonIngesterMetrics {
-	return carbonIngesterMetrics{
-		success:   m.Counter("success"),
-		err:       m.Counter("error"),
-		malformed: m.Counter("malformed"),
-	}
+type carbonIngesterMetrics struct {
+	success       tally.Counter
+	err           tally.Counter
+	malformed     tally.Counter
+	ingestLatency tally.Histogram
+	writeLatency  tally.Histogram
 }
 
-type carbonIngesterMetrics struct {
-	success   tally.Counter
-	err       tally.Counter
-	malformed tally.Counter
+func newCarbonIngesterMetrics(scope tally.Scope) (carbonIngesterMetrics, error) {
+	buckets, err := ingest.NewLatencyBuckets()
+	if err != nil {
+		return carbonIngesterMetrics{}, err
+	}
+	return carbonIngesterMetrics{
+		success:       scope.Counter("success"),
+		err:           scope.Counter("error"),
+		malformed:     scope.Counter("malformed"),
+		writeLatency:  scope.SubScope("write").Histogram("latency", buckets.WriteLatencyBuckets),
+		ingestLatency: scope.SubScope("ingest").Histogram("latency", buckets.IngestLatencyBuckets),
+	}, nil
 }
 
 // GenerateTagsFromName accepts a carbon metric name and blows it up into a list of
diff --git a/src/cmd/services/m3coordinator/ingest/metrics.go b/src/cmd/services/m3coordinator/ingest/metrics.go
new file mode 100644
index 0000000000..ad514fc117
--- /dev/null
+++ b/src/cmd/services/m3coordinator/ingest/metrics.go
@@ -0,0 +1,90 @@
+// Copyright (c) 2020 Uber Technologies, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package ingest
+
+import (
+	"time"
+
+	"github.com/uber-go/tally"
+)
+
+// LatencyBuckets is a set of histogram buckets for measuring write and
+// ingest latency.
+type LatencyBuckets struct {
+	WriteLatencyBuckets  tally.DurationBuckets
+	IngestLatencyBuckets tally.DurationBuckets
+}
+
+// NewLatencyBuckets returns write and ingest latency buckets useful for
+// measuring ingest latency (i.e. the time from when a datapoint/sample was
+// created to when it was ingested) and write latency (i.e. the time from
+// receiving a sample from a remote source to completing that write locally).
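+//
+// For example, wiring the buckets into tally histograms (scope here is an
+// assumed tally.Scope, matching how the carbon ingester above wires them):
+//
+//	buckets, err := NewLatencyBuckets()
+//	if err != nil {
+//		return err
+//	}
+//	writeLatency := scope.SubScope("write").Histogram("latency", buckets.WriteLatencyBuckets)
+//	ingestLatency := scope.SubScope("ingest").Histogram("latency", buckets.IngestLatencyBuckets)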
+func NewLatencyBuckets() (LatencyBuckets, error) { + upTo1sBuckets, err := tally.LinearDurationBuckets(0, 100*time.Millisecond, 10) + if err != nil { + return LatencyBuckets{}, err + } + + upTo10sBuckets, err := tally.LinearDurationBuckets(time.Second, 500*time.Millisecond, 18) + if err != nil { + return LatencyBuckets{}, err + } + + upTo60sBuckets, err := tally.LinearDurationBuckets(10*time.Second, 5*time.Second, 11) + if err != nil { + return LatencyBuckets{}, err + } + + upTo60mBuckets, err := tally.LinearDurationBuckets(0, 5*time.Minute, 12) + if err != nil { + return LatencyBuckets{}, err + } + upTo60mBuckets = upTo60mBuckets[1:] // Remove the first 0s to get 5 min aligned buckets + + upTo6hBuckets, err := tally.LinearDurationBuckets(time.Hour, 30*time.Minute, 12) + if err != nil { + return LatencyBuckets{}, err + } + + upTo24hBuckets, err := tally.LinearDurationBuckets(6*time.Hour, time.Hour, 19) + if err != nil { + return LatencyBuckets{}, err + } + upTo24hBuckets = upTo24hBuckets[1:] // Remove the first 6h to get 1 hour aligned buckets + + var writeLatencyBuckets tally.DurationBuckets + writeLatencyBuckets = append(writeLatencyBuckets, upTo1sBuckets...) + writeLatencyBuckets = append(writeLatencyBuckets, upTo10sBuckets...) + writeLatencyBuckets = append(writeLatencyBuckets, upTo60sBuckets...) + writeLatencyBuckets = append(writeLatencyBuckets, upTo60mBuckets...) + + var ingestLatencyBuckets tally.DurationBuckets + ingestLatencyBuckets = append(ingestLatencyBuckets, upTo1sBuckets...) + ingestLatencyBuckets = append(ingestLatencyBuckets, upTo10sBuckets...) + ingestLatencyBuckets = append(ingestLatencyBuckets, upTo60sBuckets...) + ingestLatencyBuckets = append(ingestLatencyBuckets, upTo60mBuckets...) + ingestLatencyBuckets = append(ingestLatencyBuckets, upTo6hBuckets...) + ingestLatencyBuckets = append(ingestLatencyBuckets, upTo24hBuckets...) + + return LatencyBuckets{ + WriteLatencyBuckets: writeLatencyBuckets, + IngestLatencyBuckets: ingestLatencyBuckets, + }, nil +} diff --git a/src/cmd/services/m3coordinator/ingest/metrics_test.go b/src/cmd/services/m3coordinator/ingest/metrics_test.go new file mode 100644 index 0000000000..123e936353 --- /dev/null +++ b/src/cmd/services/m3coordinator/ingest/metrics_test.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package ingest + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestLatencyBuckets(t *testing.T) { + buckets, err := NewLatencyBuckets() + require.NoError(t, err) + + // NB(r): Bucket length is tested just to sanity check how many buckets we are creating + require.Equal(t, 50, len(buckets.WriteLatencyBuckets.AsDurations())) + + // NB(r): Bucket values are tested to sanity check they look right + expected := "[0s 100ms 200ms 300ms 400ms 500ms 600ms 700ms 800ms 900ms 1s 1.5s 2s 2.5s 3s 3.5s 4s 4.5s 5s 5.5s 6s 6.5s 7s 7.5s 8s 8.5s 9s 9.5s 10s 15s 20s 25s 30s 35s 40s 45s 50s 55s 1m0s 5m0s 10m0s 15m0s 20m0s 25m0s 30m0s 35m0s 40m0s 45m0s 50m0s 55m0s]" + actual := fmt.Sprintf("%v", buckets.WriteLatencyBuckets.AsDurations()) + require.Equal(t, expected, actual) + + // NB(r): Bucket length is tested just to sanity check how many buckets we are creating + require.Equal(t, 80, len(buckets.IngestLatencyBuckets.AsDurations())) + + // NB(r): Bucket values are tested to sanity check they look right + expected = "[0s 100ms 200ms 300ms 400ms 500ms 600ms 700ms 800ms 900ms 1s 1.5s 2s 2.5s 3s 3.5s 4s 4.5s 5s 5.5s 6s 6.5s 7s 7.5s 8s 8.5s 9s 9.5s 10s 15s 20s 25s 30s 35s 40s 45s 50s 55s 1m0s 5m0s 10m0s 15m0s 20m0s 25m0s 30m0s 35m0s 40m0s 45m0s 50m0s 55m0s 1h0m0s 1h30m0s 2h0m0s 2h30m0s 3h0m0s 3h30m0s 4h0m0s 4h30m0s 5h0m0s 5h30m0s 6h0m0s 6h30m0s 7h0m0s 8h0m0s 9h0m0s 10h0m0s 11h0m0s 12h0m0s 13h0m0s 14h0m0s 15h0m0s 16h0m0s 17h0m0s 18h0m0s 19h0m0s 20h0m0s 21h0m0s 22h0m0s 23h0m0s 24h0m0s]" + actual = fmt.Sprintf("%v", buckets.IngestLatencyBuckets.AsDurations()) + require.Equal(t, expected, actual) +} diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index 493018719a..152e81eb6d 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -414,11 +414,11 @@ func Run(runOpts RunOptions) { } defer stopReporting() - // Setup regexp cache. - regexpCacheSize := cfg.Cache.RegexpConfiguration().SizeOrDefault() - if err := m3ninxindex.SetRegexpCacheSize(regexpCacheSize); err != nil { - logger.Fatal("could not set regexp cache size", zap.Error(err)) - } + // Setup index regexp compilation cache. + m3ninxindex.SetRegexpCacheOptions(m3ninxindex.RegexpCacheOptions{ + Size: cfg.Cache.RegexpConfiguration().SizeOrDefault(), + Scope: iopts.MetricsScope(), + }) // Setup query stats tracking. docsLimit := limits.DefaultLookbackLimitOptions() diff --git a/src/m3ninx/index/regexp.go b/src/m3ninx/index/regexp.go index 678546ccbe..bd1a0ff81c 100644 --- a/src/m3ninx/index/regexp.go +++ b/src/m3ninx/index/regexp.go @@ -21,20 +21,23 @@ package index import ( + "context" "fmt" re "regexp" "regexp/syntax" "sync" - fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp" + "github.com/uber-go/tally" - "github.com/hashicorp/golang-lru/simplelru" + fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp" + "github.com/m3db/m3/src/x/cache" ) var ( // dotStartCompiledRegex is a CompileRegex that matches any input. // NB: It can be accessed through DotStartCompiledRegex(). dotStarCompiledRegex CompiledRegex + cacheContext = context.Background() ) func init() { @@ -51,45 +54,54 @@ var ( // configuration methods, such as Longest. // The vellum Regexp is also safe for concurrent use as it is query for // states but does not mutate internal state. 
-	cacheLock sync.RWMutex
-	cache     *simplelru.LRU
-	cacheSize int
+	regexpCacheLock    sync.RWMutex
+	regexpCache        *cache.LRU
+	regexpCacheSize    int
+	regexpCacheMetrics *cacheMetrics
 )
 
-// RegexpCacheSize returns the regex cache size.
-func RegexpCacheSize() int {
-	cacheLock.RLock()
-	n := cacheSize
-	cacheLock.RUnlock()
-	return n
+type cacheMetrics struct {
+	hit           tally.Counter
+	miss          tally.Counter
+	errors        tally.Counter
+	unwrapSuccess tally.Counter
+	unwrapError   tally.Counter
 }
 
-// SetRegexpCacheSize sets the regex cache size; a size of zero disables the
-// cache.
-func SetRegexpCacheSize(size int) error {
-	cacheLock.Lock()
-	defer cacheLock.Unlock()
+// RegexpCacheOptions is a set of regexp cache options.
+type RegexpCacheOptions struct {
+	Size  int
+	Scope tally.Scope
+}
 
-	if size < 0 {
-		return fmt.Errorf("expected zero or greater size: actual=%d", size)
-	}
-
-	// Track the configured size so RegexpCacheSize can report it.
-	cacheSize = size
+// SetRegexpCacheOptions sets the regexp cache options; a size of zero
+// disables the cache.
+func SetRegexpCacheOptions(opts RegexpCacheOptions) {
+	regexpCacheLock.Lock()
+	defer regexpCacheLock.Unlock()
 
-	if size == 0 {
-		cache = nil
-		return nil
+	if opts.Size < 1 {
+		regexpCache = nil
+		regexpCacheMetrics = nil
+		return
 	}
 
-	if cache != nil {
-		cache.Resize(size)
-		return nil
+	scope := tally.NoopScope
+	if opts.Scope != nil {
+		scope = opts.Scope
 	}
 
-	v, err := simplelru.NewLRU(size, nil)
-	if err != nil {
-		return err
+	scope = scope.SubScope("m3ninx").SubScope("regexp").SubScope("cache")
+	regexpCache = cache.NewLRU(&cache.LRUOptions{
+		MaxEntries: opts.Size,
+		Metrics:    scope.SubScope("lru"),
+	})
+	regexpCacheMetrics = &cacheMetrics{
+		hit:           scope.Counter("hit"),
+		miss:          scope.Counter("miss"),
+		errors:        scope.Counter("errors"),
+		unwrapSuccess: scope.SubScope("unwrap").Counter("success"),
+		unwrapError:   scope.SubScope("unwrap").Counter("error"),
 	}
-
-	cache = v
-	return nil
 }
 
 // DotStarCompiledRegex returns a regexp which matches ".*".
@@ -107,24 +119,27 @@ func CompileRegex(r []byte) (CompiledRegex, error) {
 	reString := string(r)
 
 	// Check cache first.
-	var (
-		cachedValue       CompiledRegex
-		cachedValueExists bool
-		cacheEnabled      bool
-	)
-	cacheLock.RLock()
-	cacheEnabled = cache != nil
-	if cacheEnabled {
-		var cached interface{}
-		cached, cachedValueExists = cache.Get(reString)
-		if cachedValueExists {
-			cachedValue = cached.(CompiledRegex)
+	regexpCacheLock.RLock()
+	cacheLRU := regexpCache
+	cacheLRUMetrics := regexpCacheMetrics
+	regexpCacheLock.RUnlock()
+
+	if cacheLRU != nil && cacheLRUMetrics != nil {
+		// NB: use the locally captured cacheLRU rather than the global
+		// regexpCache, which may be swapped concurrently by
+		// SetRegexpCacheOptions.
+		cached, err := cacheLRU.GetWithTTL(cacheContext, reString, nil)
+		if err != nil && err != cache.ErrEntryNotFound {
+			cacheLRUMetrics.errors.Inc(1)
+		} else if err == cache.ErrEntryNotFound || cached == nil {
+			cacheLRUMetrics.miss.Inc(1)
+		} else {
+			cacheLRUMetrics.hit.Inc(1)
+			if unwrapped, ok := cached.(*CompiledRegex); ok {
+				cacheLRUMetrics.unwrapSuccess.Inc(1)
+				return *unwrapped, nil
+			}
+			// Unable to unwrap into expected type.
+			cacheLRUMetrics.unwrapError.Inc(1)
 		}
 	}
-	cacheLock.RUnlock()
-	if cachedValueExists {
-		return cachedValue, nil
-	}
 
 	// first, we parse the regular expression into the equivalent regex
 	reAst, err := parseRegexp(reString)
@@ -166,11 +181,11 @@ func CompileRegex(r []byte) (CompiledRegex, error) {
 	compiledRegex.PrefixEnd = end
 
 	// Update cache if cache existed when we checked.
-	if cacheEnabled {
-		cacheLock.Lock()
-		// Update the entry now the write lock is held. NB: assumes the cache
-		// was not concurrently disabled since the read-locked check above.
-		cache.Add(reString, compiledRegex)
-		cacheLock.Unlock()
+	if cacheLRU != nil {
+		// Copy the compiled regex so the cached pointer refers to a stable
+		// value.
+		copied := compiledRegex
+		// No need to hold regexpCacheLock for Put; the LRU locks internally.
+		cacheLRU.Put(reString, &copied)
+	}
 
 	return compiledRegex, nil
 }
diff --git a/src/m3ninx/index/regexp_test.go b/src/m3ninx/index/regexp_test.go
index d2bff03767..7762ee622a 100644
--- a/src/m3ninx/index/regexp_test.go
+++ b/src/m3ninx/index/regexp_test.go
@@ -27,8 +27,11 @@ import (
 	"testing"
 	"unicode"
 
+	"github.com/m3db/m3/src/x/tallytest"
+
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"github.com/uber-go/tally"
 )
 
 func TestEnsureSyntaxPerlTreatsAnchorsAsTextTerminator(t *testing.T) {
@@ -411,3 +414,20 @@ func dumpRegexpHelper(b *strings.Builder, re *syntax.Regexp) {
 	}
 	b.WriteByte('}')
 }
+
+func TestRegexpCache(t *testing.T) {
+	scope := tally.NewTestScope("", nil)
+
+	SetRegexpCacheOptions(RegexpCacheOptions{Size: 1, Scope: scope})
+	defer SetRegexpCacheOptions(RegexpCacheOptions{Size: 0})
+
+	_, err := CompileRegex([]byte("foo.*bar"))
+	require.NoError(t, err)
+
+	tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.miss", nil)
+
+	_, err = CompileRegex([]byte("foo.*bar"))
+	require.NoError(t, err)
+
+	tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.hit", nil)
+}
diff --git a/src/query/api/v1/handler/prometheus/remote/write.go b/src/query/api/v1/handler/prometheus/remote/write.go
index 063670a304..0c98f2f8d7 100644
--- a/src/query/api/v1/handler/prometheus/remote/write.go
+++ b/src/query/api/v1/handler/prometheus/remote/write.go
@@ -197,63 +197,22 @@ type promWriteMetrics struct {
 }
 
 func newPromWriteMetrics(scope tally.Scope) (promWriteMetrics, error) {
-	upTo1sBuckets, err := tally.LinearDurationBuckets(0, 100*time.Millisecond, 10)
+	buckets, err := ingest.NewLatencyBuckets()
 	if err != nil {
 		return promWriteMetrics{}, err
 	}
-
-	upTo10sBuckets, err := tally.LinearDurationBuckets(time.Second, 500*time.Millisecond, 18)
-	if err != nil {
-		return promWriteMetrics{}, err
-	}
-
-	upTo60sBuckets, err := tally.LinearDurationBuckets(10*time.Second, 5*time.Second, 11)
-	if err != nil {
-		return promWriteMetrics{}, err
-	}
-
-	upTo60mBuckets, err := tally.LinearDurationBuckets(0, 5*time.Minute, 12)
-	if err != nil {
-		return promWriteMetrics{}, err
-	}
-	upTo60mBuckets = upTo60mBuckets[1:] // Remove the first 0s to get 5 min aligned buckets
-
-	upTo6hBuckets, err := tally.LinearDurationBuckets(time.Hour, 30*time.Minute, 12)
-	if err != nil {
-		return promWriteMetrics{}, err
-	}
-
-	upTo24hBuckets, err := tally.LinearDurationBuckets(6*time.Hour, time.Hour, 19)
-	if err != nil {
-		return promWriteMetrics{}, err
-	}
-	upTo24hBuckets = upTo24hBuckets[1:] // Remove the first 6h to get 1 hour aligned buckets
-
-	var writeLatencyBuckets tally.DurationBuckets
-	writeLatencyBuckets = append(writeLatencyBuckets, upTo1sBuckets...)
-	writeLatencyBuckets = append(writeLatencyBuckets, upTo10sBuckets...)
-	writeLatencyBuckets = append(writeLatencyBuckets, upTo60sBuckets...)
-	writeLatencyBuckets = append(writeLatencyBuckets, upTo60mBuckets...)
-
-	var ingestLatencyBuckets tally.DurationBuckets
-	ingestLatencyBuckets = append(ingestLatencyBuckets, upTo1sBuckets...)
-	ingestLatencyBuckets = append(ingestLatencyBuckets, upTo10sBuckets...)
-	ingestLatencyBuckets = append(ingestLatencyBuckets, upTo60sBuckets...)
-	ingestLatencyBuckets = append(ingestLatencyBuckets, upTo60mBuckets...)
-	ingestLatencyBuckets = append(ingestLatencyBuckets, upTo6hBuckets...)
-	ingestLatencyBuckets = append(ingestLatencyBuckets, upTo24hBuckets...)
return promWriteMetrics{ writeSuccess: scope.SubScope("write").Counter("success"), writeErrorsServer: scope.SubScope("write").Tagged(map[string]string{"code": "5XX"}).Counter("errors"), writeErrorsClient: scope.SubScope("write").Tagged(map[string]string{"code": "4XX"}).Counter("errors"), - writeBatchLatency: scope.SubScope("write").Histogram("batch-latency", writeLatencyBuckets), - writeBatchLatencyBuckets: writeLatencyBuckets, - ingestLatency: scope.SubScope("ingest").Histogram("latency", ingestLatencyBuckets), - ingestLatencyBuckets: ingestLatencyBuckets, + writeBatchLatency: scope.SubScope("write").Histogram("batch-latency", buckets.WriteLatencyBuckets), + writeBatchLatencyBuckets: buckets.WriteLatencyBuckets, + ingestLatency: scope.SubScope("ingest").Histogram("latency", buckets.IngestLatencyBuckets), + ingestLatencyBuckets: buckets.IngestLatencyBuckets, forwardSuccess: scope.SubScope("forward").Counter("success"), forwardErrors: scope.SubScope("forward").Counter("errors"), forwardDropped: scope.SubScope("forward").Counter("dropped"), - forwardLatency: scope.SubScope("forward").Histogram("latency", writeLatencyBuckets), + forwardLatency: scope.SubScope("forward").Histogram("latency", buckets.WriteLatencyBuckets), }, nil } diff --git a/src/x/cache/cache.go b/src/x/cache/cache.go new file mode 100644 index 0000000000..db44b671a8 --- /dev/null +++ b/src/x/cache/cache.go @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package cache + +import ( + "context" + "time" +) + +// A LoaderWithTTLFunc is a function for loading entries from a cache, also +// returning an expiration time. +type LoaderWithTTLFunc func(ctx context.Context, key string) (interface{}, time.Time, error) + +// A LoaderFunc is a function for loading entries from a cache. +type LoaderFunc func(ctx context.Context, key string) (interface{}, error) + +// Cache is an interface for caches. +type Cache interface { + // Put puts a new item in the cache with the default TTL. + Put(key string, value interface{}) + + // PutWithTTL puts a new item in the cache with a specific TTL. + PutWithTTL(key string, value interface{}, ttl time.Duration) + + // Get returns the value associated with the key, optionally + // loading it if it does not exist or has expired. 
+ // NB(mmihic): We pass the loader as an argument rather than + // making it a property of the cache to support access specific + // loading arguments which might not be bundled into the key. + Get(ctx context.Context, key string, loader LoaderFunc) (interface{}, error) + + // GetWithTTL returns the value associated with the key, optionally + // loading it if it does not exist or has expired, and allowing the + // loader to return a TTL for the resulting value, overriding the + // default TTL associated with the cache. + GetWithTTL(ctx context.Context, key string, loader LoaderWithTTLFunc) (interface{}, error) +} diff --git a/src/x/cache/lru_cache.go b/src/x/cache/lru_cache.go new file mode 100644 index 0000000000..6a20e1b722 --- /dev/null +++ b/src/x/cache/lru_cache.go @@ -0,0 +1,521 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package cache + +import ( + "container/list" + "context" + "errors" + "math" + "sync" + "time" + + "github.com/uber-go/tally" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// Defaults for use with the LRU cache. +const ( + DefaultTTL = time.Minute * 30 + DefaultMaxEntries = 10000 +) + +// Metrics names +const ( + loadTimesHistogram = "load_times" + loadAttemptsCounter = "load_attempts" + loadsCounter = "loads" + accessCounter = "accesses" + entriesGauge = "entries" +) + +// Metrics tags +var ( + hitsTags = map[string]string{"status": "hit"} + missesTags = map[string]string{"status": "miss"} + successTags = map[string]string{"status": "success"} + failureTags = map[string]string{"status": "error"} +) + +// An UncachedError can be used to wrap an error that should not be +// cached, even if the cache is caching errors. The underlying error +// will be unwrapped before returning to the caller. +type UncachedError struct { + Err error +} + +// Error returns the message for the underlying error +func (e UncachedError) Error() string { + return e.Err.Error() +} + +// Unwrap unwraps the underlying error +func (e UncachedError) Unwrap() error { + return e.Err +} + +// As returns true if the caller is asking for the error as an uncached error +func (e UncachedError) As(target interface{}) bool { + if uncached, ok := target.(*UncachedError); ok { + uncached.Err = e.Err + return true + } + return false +} + +// CachedError is a wrapper that can be used to force an error to +// be cached. Useful when you want to opt-in to error caching. 
The +// underlying error will be unwrapped before returning to the caller. +type CachedError struct { + Err error +} + +// Error returns the message for the underlying error +func (e CachedError) Error() string { + return e.Err.Error() +} + +// Unwrap unwraps the underlying error +func (e CachedError) Unwrap() error { + return e.Err +} + +// As returns true if the caller is asking for the error as an cached error +func (e CachedError) As(target interface{}) bool { + if cached, ok := target.(*CachedError); ok { + cached.Err = e.Err + return true + } + return false +} + +var ( + _ error = UncachedError{} + _ error = CachedError{} +) + +var ( + // ErrEntryNotFound is returned if a cache entry cannot be found. + ErrEntryNotFound = status.Error(codes.NotFound, "not found") + + // ErrCacheFull is returned if we need to load an entry, but the cache is already full of entries that are loading. + ErrCacheFull = status.Error(codes.ResourceExhausted, "try again later") +) + +// LRUOptions are the options to an LRU cache. +type LRUOptions struct { + TTL time.Duration + InitialSize int + MaxEntries int + MaxConcurrency int + CacheErrorsByDefault bool + Metrics tally.Scope + Now func() time.Time +} + +// LRU is a fixed size LRU cache supporting expiration, loading of entries that +// do not exist, ability to cache negative results (e.g errors from load), and a +// mechanism for preventing multiple goroutines from entering the loader function +// simultaneously for the same key, and a mechanism for restricting the amount of +// total concurrency in the loader function. +type LRU struct { + // TODO(mmihic): Consider striping these mutexes + the map entry so writes only + // take a lock out on a subset of the cache + mut sync.Mutex + metrics *lruCacheMetrics + cacheErrors bool + maxEntries int + ttl time.Duration + concurrencyLeases chan struct{} + now func() time.Time + byAccessTime *list.List + byLoadTime *list.List + entries map[string]*lruCacheEntry +} + +// NewLRU returns a new LRU with the provided options. +func NewLRU(opts *LRUOptions) *LRU { + if opts == nil { + opts = &LRUOptions{} + } + + ttl := opts.TTL + if ttl == 0 { + ttl = DefaultTTL + } + + maxEntries := opts.MaxEntries + if maxEntries <= 0 { + maxEntries = DefaultMaxEntries + } + + initialSize := opts.InitialSize + if initialSize <= 0 { + initialSize = int(math.Min(1000, float64(maxEntries))) + } + + tallyScope := opts.Metrics + if tallyScope == nil { + tallyScope = tally.NoopScope + } + + now := opts.Now + if now == nil { + now = time.Now + } + + var concurrencyLeases chan struct{} + if opts.MaxConcurrency > 0 { + concurrencyLeases = make(chan struct{}, opts.MaxConcurrency) + for i := 0; i < opts.MaxConcurrency; i++ { + concurrencyLeases <- struct{}{} + } + } + + return &LRU{ + ttl: ttl, + now: now, + maxEntries: maxEntries, + cacheErrors: opts.CacheErrorsByDefault, + concurrencyLeases: concurrencyLeases, + metrics: &lruCacheMetrics{ + entries: tallyScope.Gauge(entriesGauge), + hits: tallyScope.Tagged(hitsTags).Counter(accessCounter), + misses: tallyScope.Tagged(missesTags).Counter(accessCounter), + loadAttempts: tallyScope.Counter(loadAttemptsCounter), + loadSuccesses: tallyScope.Tagged(successTags).Counter(loadsCounter), + loadFailures: tallyScope.Tagged(failureTags).Counter(loadsCounter), + loadTimes: tallyScope.Histogram(loadTimesHistogram, tally.DefaultBuckets), + }, + byAccessTime: list.New(), + byLoadTime: list.New(), + entries: make(map[string]*lruCacheEntry, initialSize), + } +} + +// Put puts a value directly into the cache. 
Uses the default TTL. +func (c *LRU) Put(key string, value interface{}) { + c.PutWithTTL(key, value, 0) +} + +// PutWithTTL puts a value directly into the cache with a custom TTL. +func (c *LRU) PutWithTTL(key string, value interface{}, ttl time.Duration) { + var expiresAt time.Time + if ttl > 0 { + expiresAt = c.now().Add(ttl) + } + + c.mut.Lock() + defer c.mut.Unlock() + + _, _ = c.updateCacheEntry(key, expiresAt, value, nil) +} + +// Get returns the value associated with the key, optionally +// loading it if it does not exist or has expired. +// NB(mmihic): We pass the loader as an argument rather than +// making it a property of the cache to support access specific +// loading arguments which might not be bundled into the key. +func (c *LRU) Get(ctx context.Context, key string, loader LoaderFunc) (interface{}, error) { + return c.GetWithTTL(ctx, key, func(ctx context.Context, key string) (interface{}, time.Time, error) { + val, err := loader(ctx, key) + return val, time.Time{}, err + }) +} + +// GetWithTTL returns the value associated with the key, optionally +// loading it if it does not exist or has expired, and allowing the +// loader to return a TTL for the resulting value, overriding the +// default TTL associated with the cache. +func (c *LRU) GetWithTTL(ctx context.Context, key string, loader LoaderWithTTLFunc) (interface{}, error) { + // Spin until it's either loaded or the load fails. + for { + value, load, loadingCh, err := c.tryCached(key) + + // There was a cached error, so just return it + if err != nil { + return nil, err + } + + // Someone else is loading the entry, wait for this to complete + // (or the context to end) and try to acquire again. + if loadingCh != nil { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-loadingCh: + } + continue + } + + // No entry exists and no-one else is trying to load it, so we + // should try to do so (outside of the mutex lock). + if load { + if loader == nil { + return nil, ErrEntryNotFound + } + + return c.tryLoad(ctx, key, loader) + } + + // There is an entry and it's valid, return it. + return value, nil + } +} + +// has checks whether the cache has the given key. Exists only to support tests. +func (c *LRU) has(key string, checkExpiry bool) bool { + c.mut.Lock() + defer c.mut.Unlock() + entry, exists := c.entries[key] + + if !exists { + return false + } + + if checkExpiry { + return entry.loadingCh != nil || entry.expiresAt.After(c.now()) + } + + return true +} + +// tryCached returns a value from the cache, or an indication of +// the caller should do (return an error, load the value, wait for a concurrent +// load to complete). +func (c *LRU) tryCached(key string) (interface{}, bool, chan struct{}, error) { + c.mut.Lock() + defer c.mut.Unlock() + + entry, exists := c.entries[key] + + // If a load is already in progress, tell the caller to wait for it to finish. 
+ if exists && entry.loadingCh != nil { + return nil, false, entry.loadingCh, nil + } + + // If the entry exists and has not expired, it's a hit - return it to the caller + if exists && entry.expiresAt.After(c.now()) { + c.metrics.hits.Inc(1) + c.byAccessTime.MoveToFront(entry.accessTimeElt) + return entry.value, false, nil, entry.err + } + + // Otherwise we need to load it + c.metrics.misses.Inc(1) + + if !exists { + // The entry doesn't exist, clear enough space for it and then add it + if err := c.reserveCapacity(1); err != nil { + return nil, false, nil, err + } + + entry = c.newEntry(key) + } else { + // The entry expired, don't consider it for eviction while we're loading + c.byAccessTime.Remove(entry.accessTimeElt) + c.byLoadTime.Remove(entry.loadTimeElt) + } + + // Create a channel that other callers can block on waiting for this to complete + entry.loadingCh = make(chan struct{}) + return nil, true, nil, nil +} + +// cacheLoadComplete is called when a cache load has completed with either a value or error. +func (c *LRU) cacheLoadComplete( + key string, expiresAt time.Time, value interface{}, err error, +) (interface{}, error) { + c.mut.Lock() + defer c.mut.Unlock() + + if err != nil { + return c.handleCacheLoadError(key, expiresAt, err) + } + + return c.updateCacheEntry(key, expiresAt, value, err) +} + +// handleCacheLoadError handles the results of an error from a cache load. If +// we are caching errors, updates the cache entry with the error. Otherwise +// removes the cache entry and returns the (possible unwrapped) error. +func (c *LRU) handleCacheLoadError( + key string, expiresAt time.Time, err error, +) (interface{}, error) { + // If the loader is telling us to cache this error, do so unconditionally + var cachedErr CachedError + if errors.As(err, &cachedErr) { + return c.updateCacheEntry(key, expiresAt, nil, cachedErr.Err) + } + + // If the cache is configured to cache errors by default, do so unless + // the loader is telling us not to cache this one (e.g. it's transient) + var uncachedErr UncachedError + isUncachedError := errors.As(err, &uncachedErr) + if c.cacheErrors && !isUncachedError { + return c.updateCacheEntry(key, expiresAt, nil, err) + } + + // Something happened during load, but we don't want to cache this - remove the entry, + // tell any blocked callers they can try again, and return the error + entry := c.entries[key] + c.remove(entry) + close(entry.loadingCh) + entry.loadingCh = nil + + if isUncachedError { + return nil, uncachedErr.Err + } + + return nil, err +} + +// updateCacheEntry updates a cache entry with a new value or cached error, +// and marks it as the most recently accessed and most recently loaded entry +func (c *LRU) updateCacheEntry( + key string, expiresAt time.Time, value interface{}, err error, +) (interface{}, error) { + entry := c.entries[key] + entry.value, entry.err = value, err + + // Re-adjust expiration and mark as both most recently access and most recently used + if expiresAt.IsZero() { + expiresAt = c.now().Add(c.ttl) + } + + entry.expiresAt = expiresAt + entry.loadTimeElt = c.byLoadTime.PushFront(entry) + entry.accessTimeElt = c.byAccessTime.PushFront(entry) + c.metrics.entries.Update(float64(len(c.entries))) + + // Tell any other callers that we're done loading + close(entry.loadingCh) + entry.loadingCh = nil + return value, err +} + +// reserveCapacity evicts expired and least recently used entries (that aren't loading) +// until we have at least enough space for new entries. 
+// NB(mmihic): Must be called with the cache mutex locked.
+func (c *LRU) reserveCapacity(n int) error {
+	// Unconditionally evict all expired entries. Entries that are expired by
+	// reloading are not in this list, and therefore will not be evicted.
+	oldestElt := c.byLoadTime.Back()
+	for oldestElt != nil {
+		entry := oldestElt.Value.(*lruCacheEntry)
+		if entry.expiresAt.After(c.now()) {
+			break
+		}
+		c.remove(entry)
+
+		oldestElt = c.byLoadTime.Back()
+	}
+
+	// Evict the least recently accessed entries that are not loading, until we
+	// either run out of entries to evict or we have enough space.
+	lruElt := c.byAccessTime.Back()
+	for c.maxEntries-len(c.entries) < n && lruElt != nil {
+		c.remove(lruElt.Value.(*lruCacheEntry))
+
+		lruElt = c.byAccessTime.Back()
+	}
+
+	// If we couldn't create enough space, then there are too many entries
+	// loading and the cache is simply full
+	if c.maxEntries-len(c.entries) < n {
+		return ErrCacheFull
+	}
+
+	return nil
+}
+
+// tryLoad tries to load the value for a key from the loader.
+// NB(mmihic): Must NOT be called with the cache mutex locked.
+func (c *LRU) tryLoad(
+	ctx context.Context, key string, loader LoaderWithTTLFunc,
+) (interface{}, error) {
+	// If we're limiting overall concurrency, acquire a concurrency lease
+	if c.concurrencyLeases != nil {
+		select {
+		case <-ctx.Done():
+			return c.cacheLoadComplete(key, time.Time{}, nil, UncachedError{ctx.Err()})
+		case <-c.concurrencyLeases:
+		}
+
+		defer func() { c.concurrencyLeases <- struct{}{} }()
+	}
+
+	// Increment load attempts ahead of load so we have metrics for thundering herds blocked in the loader
+	c.metrics.loadAttempts.Inc(1)
+	start := c.now()
+	value, expiresAt, err := loader(ctx, key)
+	c.metrics.loadTimes.RecordDuration(c.now().Sub(start))
+	if err == nil {
+		c.metrics.loadSuccesses.Inc(1)
+	} else {
+		c.metrics.loadFailures.Inc(1)
+	}
+
+	return c.cacheLoadComplete(key, expiresAt, value, err)
+}
+
+// remove removes an entry from the cache.
+// NB(mmihic): Must be called with the cache mutex locked.
+func (c *LRU) remove(entry *lruCacheEntry) {
+	delete(c.entries, entry.key)
+	if entry.accessTimeElt != nil {
+		c.byAccessTime.Remove(entry.accessTimeElt)
+	}
+
+	if entry.loadTimeElt != nil {
+		c.byLoadTime.Remove(entry.loadTimeElt)
+	}
+}
+
+// newEntry creates and adds a new cache entry.
+// NB(mmihic): Must be called with the cache mutex locked.
+func (c *LRU) newEntry(key string) *lruCacheEntry {
+	entry := &lruCacheEntry{key: key}
+	c.entries[key] = entry
+	return entry
+}
+
+type lruCacheEntry struct {
+	key           string
+	accessTimeElt *list.Element
+	loadTimeElt   *list.Element
+	loadingCh     chan struct{}
+	expiresAt     time.Time
+	err           error
+	value         interface{}
+}
+
+type lruCacheMetrics struct {
+	entries       tally.Gauge
+	hits          tally.Counter
+	misses        tally.Counter
+	loadAttempts  tally.Counter
+	loadSuccesses tally.Counter
+	loadFailures  tally.Counter
+	loadTimes     tally.Histogram
+}
+
+var _ Cache = &LRU{}
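A minimal usage sketch of the LRU above. The key name and inline loader are
illustrative only; NewLRU, LRUOptions, Get and the LoaderFunc signature all
come from this file. Concurrent Gets for the same key share a single load via
the entry's loadingCh, and MaxConcurrency bounds in-flight loads across keys:

	package main

	import (
		"context"
		"fmt"
		"time"

		"github.com/m3db/m3/src/x/cache"
	)

	func main() {
		c := cache.NewLRU(&cache.LRUOptions{
			MaxEntries:     1024,
			TTL:            time.Minute,
			MaxConcurrency: 8, // at most 8 loaders in flight across all keys
		})

		// The loader runs at most once per key until the entry expires; other
		// goroutines asking for the same key block on the in-progress load.
		val, err := c.Get(context.Background(), "user:42",
			func(ctx context.Context, key string) (interface{}, error) {
				return "value-for-" + key, nil // stand-in for an expensive load
			})
		if err != nil {
			fmt.Println("load failed:", err)
			return
		}
		fmt.Println(val)
	}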
diff --git a/src/x/cache/lru_cache_test.go b/src/x/cache/lru_cache_test.go
new file mode 100644
index 0000000000..d2ef4eb463
--- /dev/null
+++ b/src/x/cache/lru_cache_test.go
@@ -0,0 +1,638 @@
+// Copyright (c) 2020 Uber Technologies, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package cache
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/m3db/m3/src/x/tallytest"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/uber-go/tally"
+)
+
+func TestLRU_Get_SingleLoadPerKey(t *testing.T) {
+	tt := newLRUTester(3, 0)
+
+	// Spin up a bunch of goroutines to access the cache simultaneously, and
+	// only release once they are all ready.
+	var (
+		wgDone  sync.WaitGroup
+		wgReady sync.WaitGroup
+
+		releaseCh = make(chan struct{})
+	)
+
+	keys := []string{"key-0", "key-1"}
+
+	for i := 0; i < 10; i++ {
+		key := keys[i%len(keys)]
+		wgReady.Add(1)
+		wgDone.Add(1)
+		go func() {
+			defer wgDone.Done()
+
+			// Unblock the triggering goroutine
+			wgReady.Done()
+
+			// Wait for the triggering goroutine to unblock us
+			<-releaseCh
+
+			// Sleep a bit to let other goroutines wake up
+			time.Sleep(time.Millisecond * 100)
+
+			// Fetch and tell the triggering goroutine that we're done
+			value, err := tt.c.Get(context.Background(), key, tt.defaultLoad)
+			require.NoError(t, err)
+			require.Equal(t, fmt.Sprintf("%s-00001", key), value)
+		}()
+	}
+
+	wgReady.Wait()
+	close(releaseCh)
+	wgDone.Wait()
+
+	// We should only have entered the loader once for each key, even though
+	// multiple goroutines were active simultaneously.
+	assert.Equal(t, int64(1), *tt.callsToLoad["key-0"])
+	assert.Equal(t, int64(1), *tt.callsToLoad["key-1"])
+
+	// Make sure we're reporting proper metrics
+	snapshot := tt.metrics.Snapshot()
+	tallytest.AssertCounterValue(t, 2, snapshot, loadAttemptsCounter, nil)
+	tallytest.AssertCounterValue(t, 2, snapshot, loadsCounter, successTags)
+	tallytest.AssertCounterValue(t, 0, snapshot, loadsCounter, failureTags)
+	tallytest.AssertCounterValue(t, 2, snapshot, accessCounter, missesTags)
+	tallytest.AssertCounterValue(t, 8, snapshot, accessCounter, hitsTags)
+	tallytest.AssertGaugeValue(t, 2, snapshot, entriesGauge, nil)
+}
+
+func TestLRU_Get_HonorsContext(t *testing.T) {
+	tt := newLRUTester(3, 0)
+
+	// Spin up a background goroutine that loads a key.
+	var (
+		blockerCh = make(chan struct{})
+		doneCh    = make(chan struct{})
+	)
+
+	blockedLoad, waitForStartCh := blockingLoad(blockerCh, tt.defaultLoad)
+	go func() {
+		// NB(mmihic): Does not use the cancellation context
+		defer close(doneCh)
+		val, err := tt.c.Get(context.Background(), "key-0", blockedLoad)
+		require.NoError(t, err)
+		require.Equal(t, "key-0-00001", val)
+	}()
+
+	<-waitForStartCh
+
+	// Spin up several more background goroutines that access the same key.
+	// These will block until the main goroutine completes or the context is done.
+	var wg sync.WaitGroup
+	ctx, cancel := context.WithTimeout(context.Background(), time.Hour*24)
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			_, err := tt.c.Get(ctx, "key-0", tt.defaultLoad)
+			require.Equal(t, context.Canceled, err)
+		}()
+	}
+
+	// Cancel the context; the background goroutines should exit with
+	// context.Canceled. Wait for them to complete.
+	cancel()
+	wg.Wait()
+
+	// Now let the first goroutine complete.
+	close(blockerCh)
+	<-doneCh
+}
+
+func TestLRU_Get_LimitsTotalConcurrentLoad(t *testing.T) {
+	tt := newLRUTester(10, 5)
+
+	// Spin up 5 blocked goroutines, each for a different key
+	var (
+		blockedChs = make([]chan struct{}, 5)
+		doneChs    = make([]chan struct{}, 5)
+	)
+	for i := 0; i < len(blockedChs); i++ {
+		key := fmt.Sprintf("key-%d", i)
+		doneCh := make(chan struct{})
+
+		blockedChs[i] = make(chan struct{})
+		doneChs[i] = doneCh
+
+		blockingLoadFn, waitForStartCh := blockingLoad(blockedChs[i], tt.defaultLoad)
+		go func() {
+			defer close(doneCh)
+			val, err := tt.c.Get(context.Background(), key, blockingLoadFn)
+			require.NoError(t, err)
+			require.Equal(t, fmt.Sprintf("%s-00001", key), val.(string))
+		}()
+		<-waitForStartCh
+	}
+
+	// Try to acquire a 6th key - this will block since there are no concurrency leases
+	// available. Let it time out.
+	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*500)
+	defer cancel()
+
+	_, err := tt.c.Get(ctx, "key-9", tt.defaultLoad)
+	require.Error(t, err)
+	assert.Equal(t, err, context.DeadlineExceeded)
+
+	// Release one of the 5 blocked goroutines and wait for it to complete
+	close(blockedChs[0])
+	<-doneChs[0]
+
+	// Try to acquire a 6th key again - this should proceed since we've freed up a lease
+	val, err := tt.c.Get(context.Background(), "key-9", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-9-00001", val)
+
+	// Release the remaining blocked goroutines
+	for i := 1; i < len(blockedChs); i++ {
+		close(blockedChs[i])
+		<-doneChs[i]
+	}
+}
+
+func TestLRU_Get_EvictsExpiredEntriesPriorToLoading(t *testing.T) {
+	tt := newLRUTester(3, 0)
+	ctx := context.Background()
+
+	// Load 3 entries with enough time between them that we can expire two without expiring the third
+	val, err := tt.c.Get(ctx, "key-0", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-0-00001", val)
+	tt.now = tt.now.Add(time.Minute * 5)
+
+	val, err = tt.c.Get(ctx, "key-1", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-1-00001", val)
+	tt.now = tt.now.Add(time.Minute * 5)
+
+	val, err = tt.c.Get(ctx, "key-2", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-2-00001", val)
+	tt.now = tt.now.Add(time.Minute * 5)
+
+	// Access the oldest expiring entry to make sure that access does not affect expiration
+	for i := 0; i < 10; i++ {
+		val, err = tt.c.Get(ctx, "key-0", tt.defaultLoad)
+		require.NoError(t, err)
+		require.Equal(t, "key-0-00001", val)
+	}
+
+	// Advance time far enough to expire the first two entries
+	tt.now = tt.now.Add(tt.ttl - (time.Minute * 5) - time.Second) // just before the last entry's expiration
+
+	// Access a (non-expired) cached entry, should not expire anything
+	val, err = tt.c.Get(ctx, "key-2", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-2-00001", val)
+	snapshot := tt.metrics.Snapshot()
+	tallytest.AssertGaugeValue(t, 3, snapshot, entriesGauge, nil)
+	assert.True(t, tt.c.has("key-0", false))
+	assert.True(t, tt.c.has("key-1", false))
+	assert.True(t, tt.c.has("key-2", false))
+
+	// Access a new entry, should remove the two expired entries
+	val, err = tt.c.Get(ctx, "key-3", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-3-00001", val)
+	snapshot = tt.metrics.Snapshot()
+	tallytest.AssertGaugeValue(t, 2, snapshot, entriesGauge, nil)
+	assert.False(t, tt.c.has("key-0", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-1", false)) // removed due to expiry
+	assert.True(t, tt.c.has("key-2", false))  // not expired
+	assert.True(t, tt.c.has("key-3", false))  // not expired
+
+	// Spin up a goroutine to load another entry, but let it block in the loading function
+	var (
+		blockerCh = make(chan struct{})
+		doneCh    = make(chan struct{})
+	)
+
+	blockedLoadFn, waitForStartCh := blockingLoad(blockerCh, tt.defaultLoad)
+	go func() {
+		// nolint: govet
+		val, err := tt.c.Get(ctx, "key-4", blockedLoadFn)
+		require.NoError(t, err)
+		require.Equal(t, "key-4-00001", val)
+		close(doneCh)
+	}()
+	<-waitForStartCh
+
+	// Advance time enough that all entries are expired, including the one that's being actively loaded
+	tt.now = tt.now.Add(tt.ttl + time.Second)
+
+	// Access a new entry, will remove all of the expired entries except the one that is currently loading
+	val, err = tt.c.Get(ctx, "key-5", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-5-00001", val)
+	snapshot = tt.metrics.Snapshot()
+	tallytest.AssertGaugeValue(t, 2, snapshot, entriesGauge, nil)
+	assert.False(t, tt.c.has("key-0", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-1", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-2", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-3", false)) // removed due to expiry
+	assert.True(t, tt.c.has("key-4", false))  // technically expired, but not removed due to being in loading state
+	assert.True(t, tt.c.has("key-5", true))   // newly loaded key
+
+	// Allow the load to complete - the newly loaded entry should no longer be expired
+	close(blockerCh)
+	<-doneCh
+	assert.False(t, tt.c.has("key-0", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-1", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-2", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-3", false)) // removed due to expiry
+	assert.True(t, tt.c.has("key-4", true))   // not expired
+	assert.True(t, tt.c.has("key-5", true))   // not expired
+
+	// Advance time so that all entries are expired
+	tt.now = tt.now.Add(tt.ttl + time.Second)
+
+	// Access one of the previously cached entries - since it is expired it should be loaded again properly
+	val, err = tt.c.Get(ctx, "key-3", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-3-00002", val)
+
+	// And ensure that it is not expired after that load
+	assert.False(t, tt.c.has("key-0", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-1", false)) // removed due to expiry
+	assert.False(t, tt.c.has("key-2", false)) // removed due to expiry
+	assert.True(t, tt.c.has("key-3", true))   // no longer expired
+	assert.False(t, tt.c.has("key-4", true))  // has now expired
+	assert.False(t, tt.c.has("key-5", true))  // has now expired
+}
+
+func TestLRU_Get_EvictsLRUEntriesToReserveCapacity(t *testing.T) {
+	tt := newLRUTester(3, 0)
+	ctx := context.Background()
+
+	// Load three entries.
+	val, err := tt.c.Get(ctx, "key-0", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-0-00001", val)
+
+	val, err = tt.c.Get(ctx, "key-1", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-1-00001", val)
+
+	val, err = tt.c.Get(ctx, "key-2", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-2-00001", val)
+
+	// Revisit the second entry to move it to the front of the LRU.
+	val, err = tt.c.Get(ctx, "key-1", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-1-00001", val)
+
+	// Load a fourth and fifth entry - should evict the first and third entry.
+	val, err = tt.c.Get(ctx, "key-3", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-3-00001", val)
+
+	val, err = tt.c.Get(ctx, "key-4", tt.defaultLoad)
+	require.NoError(t, err)
+	require.Equal(t, "key-4-00001", val)
+
+	assert.False(t, tt.c.has("key-0", false)) // removed due to LRU
+	assert.True(t, tt.c.has("key-1", false))  // was MRU so not removed
+	assert.False(t, tt.c.has("key-2", false)) // removed due to LRU
+	assert.True(t, tt.c.has("key-3", false))  // newly loaded
+	assert.True(t, tt.c.has("key-4", false))  // newly loaded
+
+	// Spin up a blocked background goroutine to load a 6th entry - this will evict the second entry.
+ var ( + blockerCh = make(chan struct{}) + wg sync.WaitGroup + ) + + wg.Add(1) + blockedLoadFn, waitForStartCh := blockingLoad(blockerCh, tt.defaultLoad) + go func() { + defer wg.Done() + + // nolint: govet + val, err := tt.c.Get(ctx, "key-5", blockedLoadFn) + require.NoError(t, err) + require.Equal(t, "key-5-00001", val) + }() + <-waitForStartCh + + val, err = tt.c.Get(ctx, "key-3", tt.defaultLoad) + require.NoError(t, err) + require.Equal(t, "key-3-00001", val) + + val, err = tt.c.Get(ctx, "key-4", tt.defaultLoad) + require.NoError(t, err) + require.Equal(t, "key-4-00001", val) + + assert.False(t, tt.c.has("key-0", false)) // removed due to LRU + assert.False(t, tt.c.has("key-1", false)) // removed due to LRU + assert.False(t, tt.c.has("key-2", false)) // removed due to LRU + assert.True(t, tt.c.has("key-3", false)) // newly loaded + assert.True(t, tt.c.has("key-4", false)) // newly loaded + assert.True(t, tt.c.has("key-5", false)) // loading + + // Access the 4th key to move it in front of the actively loading key in the LRU + val, err = tt.c.Get(ctx, "key-3", tt.defaultLoad) + require.NoError(t, err) + require.Equal(t, "key-3-00001", val) + + // Load a 7th and 8th entry, this will evict the fourth and fifth entries. Technically + // we've accessed the fourth entry after the 6th entry, but we can't evict the 6th + // entry because it is in the process of loading + val, err = tt.c.Get(ctx, "key-6", tt.defaultLoad) + require.NoError(t, err) + require.Equal(t, "key-6-00001", val) + + val, err = tt.c.Get(ctx, "key-7", tt.defaultLoad) + require.NoError(t, err) + require.Equal(t, "key-7-00001", val) + + // Spin up other blocked goroutines to reload the first and second entry + wg.Add(1) + blockedLoadFn, waitForStartCh = blockingLoad(blockerCh, tt.defaultLoad) + go func() { + defer wg.Done() + + // nolint: govet + val, err := tt.c.Get(ctx, "key-0", blockedLoadFn) + require.NoError(t, err) + require.Equal(t, "key-0-00002", val) + }() + <-waitForStartCh + + wg.Add(1) + blockedLoadFn, waitForStartCh = blockingLoad(blockerCh, tt.defaultLoad) + go func() { + defer wg.Done() + + // nolint: govet + val, err := tt.c.Get(ctx, "key-1", blockedLoadFn) + require.NoError(t, err) + require.Equal(t, "key-1-00002", val) + }() + <-waitForStartCh + + // Try to load a 9th entry - this will fail because we cannot evict any of the + // entries that are being loaded. 
+	_, err = tt.c.Get(ctx, "key-9", tt.defaultLoad)
+	require.Error(t, err)
+	assert.Equal(t, ErrCacheFull, err)
+
+	// Let the background loads complete, then re-attempt the 9th entry - this
+	// will evict the 7th entry
+	close(blockerCh)
+}
+
+func TestLRU_Get_CacheLoadErrors(t *testing.T) {
+	loadAttempts := map[string]int{}
+
+	now := time.Date(2020, time.August, 22, 14, 56, 17, 100, time.UTC)
+
+	c := NewLRU(&LRUOptions{
+		TTL:                  time.Second * 30,
+		CacheErrorsByDefault: true,
+		Now:                  func() time.Time { return now },
+	})
+
+	loader := func(_ context.Context, key string) (interface{}, error) {
+		loadAttempts[key]++
+
+		switch key {
+		case "key-1":
+			return nil, errors.New("this failed")
+		case "key-2":
+			return "foo", nil
+		case "key-3":
+			return nil, UncachedError{errors.New("this also failed")}
+		default:
+			return nil, ErrEntryNotFound
+		}
+	}
+
+	// Load a key which generates an error
+	_, err := c.Get(context.Background(), "key-1", loader)
+	require.EqualError(t, err, "this failed")
+
+	// Access it a few more times - the error should be cached
+	for i := 0; i < 10; i++ {
+		_, err = c.Get(context.Background(), "key-1", loader)
+		require.EqualError(t, err, "this failed")
+	}
+
+	// Should only have been loaded once despite resulting in an error
+	assert.Equal(t, 1, loadAttempts["key-1"])
+
+	// Access the erroring key once more - the cached error should be returned
+	// without triggering another load
+	_, err = c.Get(context.Background(), "key-1", loader)
+	require.EqualError(t, err, "this failed")
+
+	// Load a key which doesn't exist - this should be triggered and the result cached
+	_, err = c.Get(context.Background(), "non-existent", loader)
+	require.Equal(t, ErrEntryNotFound, err)
+
+	for i := 0; i < 10; i++ {
+		_, err = c.Get(context.Background(), "non-existent", loader)
+		require.Equal(t, ErrEntryNotFound, err)
+	}
+
+	assert.Equal(t, 1, loadAttempts["non-existent"])
+
+	// Advance past the TTL and re-access the key that generated an error - should reload that key
+	now = now.Add(time.Hour * 10)
+	_, err = c.Get(context.Background(), "key-1", loader)
+	require.EqualError(t, err, "this failed")
+	assert.Equal(t, 2, loadAttempts["key-1"])
+
+	// Load a key that results in an error that we are explicitly not caching - should constantly
+	// attempt to reload that key
+	for i := 0; i < 10; i++ {
+		_, err = c.Get(context.Background(), "key-3", loader)
+		require.EqualError(t, err, "this also failed")
+		require.False(t, errors.As(err, &UncachedError{})) // should have been unwrapped
+	}
+	assert.Equal(t, 10, loadAttempts["key-3"])
+}
+
+func TestLRU_Get_DontCacheLoadErrors(t *testing.T) {
+	loadAttempts := map[string]int{}
+	c := NewLRU(&LRUOptions{
+		TTL:                  time.Second * 30,
+		CacheErrorsByDefault: false,
+	})
+
+	loader := func(_ context.Context, key string) (interface{}, error) {
+		loadAttempts[key]++
+
+		if key == "always-cached" {
+			return nil, &CachedError{errors.New("this failed")}
+		}
+
+		if key == "always-uncached" {
+			return nil, &UncachedError{errors.New("this failed")}
+		}
+
+		return nil, errors.New("this failed")
+	}
+
+	// No matter how many times we access the erroring key, we'll keep going back to the loader
+	for i := 0; i < 10; i++ {
+		_, err := c.Get(context.Background(), "key-1", loader)
+		require.EqualError(t, err, "this failed")
+		require.False(t, errors.As(err, &UncachedError{}))
+		require.False(t, errors.As(err, &CachedError{}))
+	}
+	assert.Equal(t, 10, loadAttempts["key-1"])
+
+	// Allow explicit caching even when caching is disabled by default
+	for i := 0; i < 10; i++ {
+		_, err :=
c.Get(context.Background(), "always-cached", loader) + require.EqualError(t, err, "this failed") + require.False(t, errors.As(err, &UncachedError{})) + require.False(t, errors.As(err, &CachedError{})) + } + assert.Equal(t, 1, loadAttempts["always-cached"]) + + // Still unwrap uncached errors even when caching is disabled + for i := 0; i < 10; i++ { + _, err := c.Get(context.Background(), "always-uncached", loader) + require.EqualError(t, err, "this failed") + require.False(t, errors.As(err, &UncachedError{})) + require.False(t, errors.As(err, &CachedError{})) + } + assert.Equal(t, 10, loadAttempts["always-uncached"]) +} + +func TestLRU_GetWithTTL_AllowEntrySpecificTTLs(t *testing.T) { + var ( + loadAttempts = 0 + now = time.Date(2020, time.August, 22, 14, 56, 17, 100, time.UTC) + loader = func(_ context.Context, key string) (interface{}, time.Time, error) { + loadAttempts++ + return fmt.Sprintf("%s-%05d", key, loadAttempts), now.Add(time.Hour * 24), nil + } + ) + + c := NewLRU(&LRUOptions{ + TTL: time.Second * 30, + Now: func() time.Time { + return now + }, + }) + + // Repeatedly load, returning a custom TTL, advancing time past the "default" TTL but + // still within the TTL returned from the load function - should not reload + for i := 0; i < 10; i++ { + val, err := c.GetWithTTL(context.Background(), "my-key", loader) + require.NoError(t, err) + assert.Equal(t, "my-key-00001", val) + assert.Equal(t, 1, loadAttempts) + now = now.Add(time.Minute) + } + + // Advance past the TTL returned from the loader and try again - should reload + now = now.Add(time.Hour * 72) + val, err := c.GetWithTTL(context.Background(), "my-key", loader) + require.NoError(t, err) + assert.Equal(t, "my-key-00002", val) + assert.Equal(t, 2, loadAttempts) +} + +var defaultKeys = []string{ + "key-0", "key-1", "key-2", "key-3", "key-4", "key-5", "key-6", "key-7", "key-8", "key-9", "key10", +} + +type lruTester struct { + c *LRU + callsToLoad map[string]*int64 + now time.Time + ttl time.Duration + metrics tally.TestScope +} + +// newLRUTester creates a new tester for covering LRU cache functionality +func newLRUTester(maxEntries, maxConcurrency int) *lruTester { + tt := &lruTester{ + ttl: time.Minute * 30, + now: time.Date(2020, time.April, 13, 22, 15, 35, 200, time.UTC), + callsToLoad: make(map[string]*int64, len(defaultKeys)), + metrics: tally.NewTestScope("", nil), + } + + for _, key := range defaultKeys { + var i int64 + tt.callsToLoad[key] = &i + } + + cacheOpts := &LRUOptions{ + MaxEntries: maxEntries, + TTL: tt.ttl, + Metrics: tt.metrics, + MaxConcurrency: maxConcurrency, + Now: func() time.Time { return tt.now }, // use the test time + } + + tt.c = NewLRU(cacheOpts) + return tt +} + +// defaultLoad is the default implementation of a loader for a cache +func (tt *lruTester) defaultLoad(_ context.Context, key string) (interface{}, error) { + callPtr := tt.callsToLoad[key] + if callPtr == nil { + return nil, ErrEntryNotFound + } + + calls := atomic.AddInt64(callPtr, 1) + return fmt.Sprintf("%s-%05d", key, calls), nil +} + +// blockingLoad wraps a load function with one that blocks until the +// provided channel is closed. Returns a channel that the caller can wait on +// to ensure that the load function has been entered. 
+func blockingLoad(blockerCh chan struct{}, loader LoaderFunc) (LoaderFunc, chan struct{}) { + // Channel to block the caller until the loader has been called + loadFnEnteredCh := make(chan struct{}) + return func(ctx context.Context, key string) (interface{}, error) { + close(loadFnEnteredCh) + select { + case <-ctx.Done(): + return nil, UncachedError{ctx.Err()} + case <-blockerCh: + } + + return loader(ctx, key) + }, loadFnEnteredCh +} diff --git a/src/x/cache/nop_cache.go b/src/x/cache/nop_cache.go new file mode 100644 index 0000000000..8d64287f65 --- /dev/null +++ b/src/x/cache/nop_cache.go @@ -0,0 +1,47 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package cache + +import ( + "context" + "time" +) + +// NewNop returns a new nop cache. +func NewNop() Cache { + return &nopCache{} +} + +type nopCache struct{} + +func (n *nopCache) Put(_ string, _ interface{}) {} +func (n *nopCache) PutWithTTL(_ string, _ interface{}, ttl time.Duration) {} + +func (n *nopCache) Get(ctx context.Context, key string, loader LoaderFunc) (interface{}, error) { + return loader(ctx, key) +} + +func (n *nopCache) GetWithTTL(ctx context.Context, key string, loader LoaderWithTTLFunc) (interface{}, error) { + val, _, err := loader(ctx, key) + return val, err +} + +var _ Cache = &nopCache{} diff --git a/src/x/tallytest/tallytest.go b/src/x/tallytest/tallytest.go new file mode 100644 index 0000000000..2d8669e17a --- /dev/null +++ b/src/x/tallytest/tallytest.go @@ -0,0 +1,114 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package tallytest + +import ( + "fmt" + "sort" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/uber-go/tally" +) + +// AssertCounterValue asserts that the given counter has the expected value. +func AssertCounterValue(t *testing.T, expected int64, s tally.Snapshot, name string, tags map[string]string) bool { + index := flattenMetricIndex(name, tags) + counter := s.Counters()[index] + notFound := fmt.Sprintf("not found: key=%s, actual=%v", index, counterKeys(s.Counters())) + if !assert.NotNil(t, counter, notFound) { + return false + } + mismatch := fmt.Sprintf("current values: %v", counterMap(s.Counters())) + return assert.Equal(t, expected, counter.Value(), mismatch) +} + +// AssertGaugeValue asserts that the given gauge has the expected value. +func AssertGaugeValue(t *testing.T, expected float64, s tally.Snapshot, name string, tags map[string]string) bool { + index := flattenMetricIndex(name, tags) + gauge := s.Gauges()[index] + notFound := fmt.Sprintf("not found: key=%s, actual=%v", index, gaugeKeys(s.Gauges())) + if !assert.NotNil(t, gauge, notFound) { + return false + } + mismatch := fmt.Sprintf("current values: %v", gaugeMap(s.Gauges())) + return assert.InDelta(t, expected, gauge.Value(), 0.0001, mismatch) +} + +// AssertGaugeNil asserts that the given gauge does not exist. +func AssertGaugeNil(t *testing.T, s tally.Snapshot, name string, tags map[string]string) bool { + index := flattenMetricIndex(name, tags) + gauge := s.Gauges()[index] + found := fmt.Sprintf("found: key=%s, actual=%v", index, gaugeKeys(s.Gauges())) + return assert.Nil(t, gauge, found) +} + +func flattenMetricIndex(name string, tags map[string]string) string { + keys := make([]string, 0, len(tags)) + for k := range tags { + keys = append(keys, k) + } + sort.Strings(keys) + + index := name + "+" + for i, k := range keys { + sep := "" + if i != 0 { + sep = "," + } + + index += fmt.Sprintf("%s%s=%s", sep, k, tags[k]) + } + + return index +} + +func counterMap(m map[string]tally.CounterSnapshot) map[string]int64 { + result := make(map[string]int64, len(m)) + for k, v := range m { + result[k] = v.Value() + } + return result +} + +func gaugeMap(m map[string]tally.GaugeSnapshot) map[string]float64 { + result := make(map[string]float64, len(m)) + for k, v := range m { + result[k] = v.Value() + } + return result +} + +func counterKeys(m map[string]tally.CounterSnapshot) []string { + r := make([]string, 0, len(m)) + for k := range m { + r = append(r, k) + } + return r +} + +func gaugeKeys(m map[string]tally.GaugeSnapshot) []string { + r := make([]string, 0, len(m)) + for k := range m { + r = append(r, k) + } + return r +} From a6f05a73538d01db2808598ba9feb25c392b1569 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 23 Oct 2020 17:27:32 -0400 Subject: [PATCH 003/106] Revert go.mod --- go.mod | 1 - 1 file changed, 1 deletion(-) diff --git a/go.mod b/go.mod index 85c0b9108b..23e7824ff4 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,6 @@ require ( github.com/gorilla/handlers v1.4.2 // indirect github.com/gorilla/mux v1.7.3 github.com/gotestyourself/gotestyourself v2.2.0+incompatible // indirect - github.com/hashicorp/golang-lru v0.5.4 github.com/hashicorp/hcl v1.0.1-0.20190611123218-cf7d376da96d // indirect 
github.com/hydrogen18/stalecucumber v0.0.0-20151102144322-9b38526d4bdf github.com/influxdata/influxdb v1.7.7 From c8683af284a6ff80c1e3562c96f62d71e1265937 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 25 Oct 2020 17:40:17 -0400 Subject: [PATCH 004/106] Use read only bitmap by default in FST segment --- .../fst/fst_terms_postings_iterator.go | 25 +- src/m3ninx/index/segment/fst/segment.go | 21 +- .../postings/roaring/bitmap_readonly.go | 432 ++++++++++++++++++ .../postings/roaring/bitmap_readonly_test.go | 123 +++++ src/m3ninx/postings/roaring/roaring_test.go | 15 - src/m3ninx/postings/types.go | 8 +- src/x/cache/lru_cache.go | 29 +- src/x/cache/lru_cache_test.go | 10 + 8 files changed, 596 insertions(+), 67 deletions(-) create mode 100644 src/m3ninx/postings/roaring/bitmap_readonly.go create mode 100644 src/m3ninx/postings/roaring/bitmap_readonly_test.go diff --git a/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go b/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go index b0c2c25224..6cd4e1eaf5 100644 --- a/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go +++ b/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go @@ -23,24 +23,11 @@ package fst import ( sgmt "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" - postingsroaring "github.com/m3db/m3/src/m3ninx/postings/roaring" - "github.com/m3dbx/pilosa/roaring" + "github.com/m3db/m3/src/m3ninx/postings/roaring" ) -// postingsIterRoaringPoolingConfig uses a configuration that avoids allocating -// any containers in the roaring bitmap, since these roaring bitmaps are backed -// by mmaps and don't have any native containers themselves. -var postingsIterRoaringPoolingConfig = roaring.ContainerPoolingConfiguration{ - MaxArraySize: 0, - MaxRunsSize: 0, - AllocateBitmap: false, - MaxCapacity: 0, - MaxKeysAndContainersSliceLength: 128 * 10, -} - type fstTermsPostingsIter struct { - bitmap *roaring.Bitmap - postings postings.List + bitmap *roaring.ReadOnlyBitmap seg *fsSegment termsIter *fstTermsIter @@ -49,10 +36,8 @@ type fstTermsPostingsIter struct { } func newFSTTermsPostingsIter() *fstTermsPostingsIter { - bitmap := roaring.NewBitmapWithPooling(postingsIterRoaringPoolingConfig) i := &fstTermsPostingsIter{ - bitmap: bitmap, - postings: postingsroaring.NewPostingsListFromBitmap(bitmap), + bitmap: &roaring.ReadOnlyBitmap{}, } i.clear() return i @@ -61,7 +46,7 @@ func newFSTTermsPostingsIter() *fstTermsPostingsIter { var _ sgmt.TermsIterator = &fstTermsPostingsIter{} func (f *fstTermsPostingsIter) clear() { - f.bitmap.Reset() + f.bitmap.Reset(nil) f.seg = nil f.termsIter = nil f.currTerm = nil @@ -100,7 +85,7 @@ func (f *fstTermsPostingsIter) Next() bool { } func (f *fstTermsPostingsIter) Current() ([]byte, postings.List) { - return f.currTerm, f.postings + return f.currTerm, f.bitmap } func (f *fstTermsPostingsIter) Err() error { diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index 7d0efdd0a5..ea0430b937 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -34,14 +34,12 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding" "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding/docs" "github.com/m3db/m3/src/m3ninx/postings" - "github.com/m3db/m3/src/m3ninx/postings/pilosa" "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/x" "github.com/m3db/m3/src/x/context" xerrors "github.com/m3db/m3/src/x/errors" 
"github.com/m3db/m3/src/x/mmap" - pilosaroaring "github.com/m3dbx/pilosa/roaring" "github.com/m3dbx/vellum" ) @@ -387,17 +385,7 @@ func (i *termsIterable) termsNotClosedMaybeFinalizedWithRLock( return i.postingsIter, nil } -func (r *fsSegment) UnmarshalPostingsListBitmap(b *pilosaroaring.Bitmap, offset uint64) error { - r.RLock() - defer r.RUnlock() - if r.closed { - return errReaderClosed - } - - return r.unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(b, offset) -} - -func (r *fsSegment) unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(b *pilosaroaring.Bitmap, offset uint64) error { +func (r *fsSegment) unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(b *roaring.ReadOnlyBitmap, offset uint64) error { if r.finalized { return errReaderFinalized } @@ -407,8 +395,7 @@ func (r *fsSegment) unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(b return fmt.Errorf("unable to retrieve postings data: %v", err) } - b.Reset() - return b.UnmarshalBinary(postingsBytes) + return b.Reset(postingsBytes) } func (r *fsSegment) matchFieldNotClosedMaybeFinalizedWithRLock( @@ -630,8 +617,8 @@ func (r *fsSegment) retrievePostingsListWithRLock(postingsOffset uint64) (postin if err != nil { return nil, fmt.Errorf("unable to retrieve postings data: %v", err) } - - return pilosa.Unmarshal(postingsBytes) + // Read only bitmap is a very low allocation postings list. + return roaring.NewReadOnlyBitmap(postingsBytes) } func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (*vellum.FST, bool, error) { diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go new file mode 100644 index 0000000000..50bd077ffb --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -0,0 +1,432 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+
+package roaring
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"math"
+	"math/bits"
+	"sort"
+	"unsafe"
+
+	"github.com/m3db/m3/src/m3ninx/postings"
+)
+
+var (
+	errNotPilosaRoaring = errors.New("not pilosa roaring format")
+
+	headerBaseSize     = uint64(8)
+	magicNumber        = uint32(12348)
+	storageVersion     = uint32(0)
+	bitmapN            = 1024
+	runCountHeaderSize = uint32(2)
+)
+
+type containerType byte
+
+const (
+	containerUnknown containerType = iota
+	containerArray
+	containerBitmap
+	containerRun
+)
+
+var _ postings.List = (*ReadOnlyBitmap)(nil)
+
+// ReadOnlyBitmap is a read only postings list backed by a pilosa encoded
+// roaring bitmap. It allocates very little on unmarshal (just the key count)
+// and nothing per operation, except when creating an iterator; even then
+// only the iterator itself is allocated, and no further allocations occur
+// after its creation.
+type ReadOnlyBitmap struct {
+	data []byte
+	keyN uint64
+}
+
+// NewReadOnlyBitmap returns a new read only bitmap.
+func NewReadOnlyBitmap(data []byte) (*ReadOnlyBitmap, error) {
+	b := &ReadOnlyBitmap{}
+	if err := b.Reset(data); err != nil {
+		return nil, err
+	}
+	return b, nil
+}
+
+// Reset resets the read only bitmap to use the provided pilosa encoded
+// roaring bitmap data.
+func (b *ReadOnlyBitmap) Reset(data []byte) error {
+	if len(data) == 0 {
+		// Reset to nil
+		b.data = nil
+		b.keyN = 0
+		return nil
+	}
+
+	if n := len(data); uint64(n) < headerBaseSize {
+		return fmt.Errorf("must be at least %d bytes: actual=%d",
+			headerBaseSize, n)
+	}
+
+	fileMagic := uint32(binary.LittleEndian.Uint16(data[0:2]))
+	fileVersion := uint32(binary.LittleEndian.Uint16(data[2:4]))
+	if fileMagic != magicNumber {
+		return fmt.Errorf("invalid roaring file, magic number %v is incorrect",
+			fileMagic)
+	}
+
+	if fileVersion != storageVersion {
+		return fmt.Errorf("wrong roaring version, file is v%d, server requires v%d",
+			fileVersion, storageVersion)
+	}
+
+	// Read key count in bytes sizeof(cookie):(sizeof(cookie)+sizeof(uint32)).
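+	// Overall layout (assumed from the offset math in Reset and
+	// containerAtIndex below): an 8 byte header (2 byte magic cookie,
+	// 2 byte version, 4 byte key count), then keyN*12 bytes of container
+	// meta (8 byte key, 2 byte container type, 2 byte cardinality-1),
+	// then keyN*4 bytes of per-container data offsets.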
+ keyN := uint64(binary.LittleEndian.Uint32(data[4:8])) + + minBytesN := headerBaseSize + keyN*12 + keyN*4 + if uint64(len(data)) < minBytesN { + return fmt.Errorf("bitmap too small: need=%d, actual=%d", + minBytesN, len(data)) + } + + b.data = data + b.keyN = keyN + return nil +} + +type readOnlyContainer struct { + data []byte + key uint64 + containerType containerType + cardinality uint16 + offset uint32 +} + +type bitmapReadOnlyContainer struct { + values []uint64 +} + +func (b bitmapReadOnlyContainer) contains(v uint16) bool { + return (b.values[v/64] & (1 << uint64(v%64))) != 0 +} + +type arrayReadOnlyContainer struct { + values []uint16 +} + +func (a arrayReadOnlyContainer) contains(v uint16) bool { + n := len(a.values) + idx := sort.Search(n, func(i int) bool { + return a.values[i] >= v + }) + return idx < n && a.values[idx] == v +} + +type runReadOnlyContainer struct { + values []interval16 +} + +func (r runReadOnlyContainer) contains(v uint16) bool { + n := len(r.values) + idx := sort.Search(n, func(i int) bool { + return r.values[i].last >= v + }) + return idx < n && v >= r.values[idx].start && v <= r.values[idx].last +} + +func (c readOnlyContainer) bitmap() (bitmapReadOnlyContainer, bool) { + if c.containerType != containerBitmap { + return bitmapReadOnlyContainer{}, false + } + return bitmapReadOnlyContainer{ + values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN:bitmapN], + }, true +} + +func (c readOnlyContainer) array() (arrayReadOnlyContainer, bool) { + if c.containerType != containerArray { + return arrayReadOnlyContainer{}, false + } + return arrayReadOnlyContainer{ + values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality:c.cardinality], + }, true +} + +func (c readOnlyContainer) runs() (runReadOnlyContainer, bool) { + if c.containerType != containerRun { + return runReadOnlyContainer{}, false + } + runCount := binary.LittleEndian.Uint16(c.data[c.offset : c.offset+runCountHeaderSize]) + return runReadOnlyContainer{ + values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount:runCount], + }, true +} + +type interval16 struct { + start uint16 + last uint16 +} + +func (i interval16) n() uint16 { + return i.last - i.start +} + +func (b *ReadOnlyBitmap) container(key uint64) (readOnlyContainer, bool) { + index, ok := b.indexOfKey(key) + if !ok { + return readOnlyContainer{}, false + } + return b.containerAtIndex(index), true +} + +func (b *ReadOnlyBitmap) containerAtIndex(index uint64) readOnlyContainer { + meta := b.data[headerBaseSize+index*12:] + offsets := b.data[headerBaseSize+b.keyN*12+index*4:] + return readOnlyContainer{ + data: b.data, + key: b.keyAtIndex(int(index)), + containerType: containerType(binary.LittleEndian.Uint16(meta[8:10])), + cardinality: uint16(binary.LittleEndian.Uint16(meta[10:12])) + 1, + offset: binary.LittleEndian.Uint32(offsets[0:4]), + } +} + +func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { + value := uint64(id) + container, ok := b.container(highbits(value)) + if !ok { + return false + } + if bitmap, ok := container.bitmap(); ok { + return bitmap.contains(lowbits(value)) + } + if array, ok := container.array(); ok { + return array.contains(lowbits(value)) + } + if runs, ok := container.runs(); ok { + return runs.contains(lowbits(value)) + } + return false +} + +func (b *ReadOnlyBitmap) IsEmpty() bool { + return b.keyN == 0 +} + +func (b *ReadOnlyBitmap) Len() int { + l := 0 + for i := uint64(0); i < b.keyN; i++ { + l += 
int(b.containerAtIndex(i).cardinality) + } + return l +} + +func (b *ReadOnlyBitmap) Iterator() postings.Iterator { + return newReadOnlyBitmapIterator(b) +} + +func (b *ReadOnlyBitmap) Equal(other postings.List) bool { + if b.Len() != other.Len() { + return false + } + iter := b.Iterator() + otherIter := other.Iterator() + for iter.Next() { + if !otherIter.Next() { + return false + } + if iter.Current() != otherIter.Current() { + return false + } + } + return true +} + +func (b *ReadOnlyBitmap) keyAtIndex(index int) uint64 { + meta := b.data[int(headerBaseSize)+index*12:] + return binary.LittleEndian.Uint64(meta[0:8]) +} + +func (b *ReadOnlyBitmap) indexOfKey(value uint64) (uint64, bool) { + n := int(b.keyN) + idx := sort.Search(n, func(i int) bool { + return b.keyAtIndex(i) >= value + }) + if idx < n && b.keyAtIndex(idx) == value { + return uint64(idx), true + } + return 0, false +} + +func highbits(v uint64) uint64 { return v >> 16 } +func lowbits(v uint64) uint16 { return uint16(v & 0xFFFF) } + +var _ postings.Iterator = (*readOnlyBitmapIterator)(nil) + +type readOnlyBitmapIterator struct { + b *ReadOnlyBitmap + containerIndex int + containerExhausted bool + container readOnlyContainer + containerState readOnlyBitmapIteratorContainerState + currValue uint64 +} + +type readOnlyBitmapIteratorContainerState struct { + entryIndex int + bitmap []uint64 + bitmapCurr uint64 + bitmapCurrBase uint64 + bitmapCurrShifts uint64 + array []uint16 + runs []interval16 + runsCurr interval16 + runsIndex uint64 +} + +func newReadOnlyBitmapIterator( + b *ReadOnlyBitmap, +) *readOnlyBitmapIterator { + return &readOnlyBitmapIterator{ + b: b, + containerIndex: -1, + containerExhausted: true, + } +} + +func (i *readOnlyBitmapIterator) setContainer(c readOnlyContainer) { + i.container = c + + i.containerState.entryIndex = -1 + + bitmap, _ := c.bitmap() + i.containerState.bitmap = bitmap.values + i.containerState.bitmapCurr = 0 + i.containerState.bitmapCurrBase = 0 + i.containerState.bitmapCurrShifts = 0 + + array, _ := c.array() + i.containerState.array = array.values + + runs, _ := c.runs() + i.containerState.runs = runs.values + i.containerState.runsCurr = interval16{} + i.containerState.runsIndex = math.MaxUint64 +} + +func (i *readOnlyBitmapIterator) Next() bool { + if i.containerIndex >= int(i.b.keyN) { + // Already exhausted. + return false + } + + if i.containerExhausted { + // Container exhausted. + i.containerIndex++ + if i.containerIndex >= int(i.b.keyN) { + return false + } + i.containerExhausted = false + i.setContainer(i.b.containerAtIndex(uint64(i.containerIndex))) + } + + if i.container.containerType == containerBitmap { + // Bitmap container. + for i.containerState.bitmapCurr == 0 { + // All zero bits, progress to next uint64. + i.containerState.entryIndex++ + if i.containerState.entryIndex >= len(i.containerState.bitmap) { + // Move to next container. + i.containerExhausted = true + return i.Next() + } + + i.containerState.bitmapCurr = i.containerState.bitmap[i.containerState.entryIndex] + i.containerState.bitmapCurrBase = uint64(64 * i.containerState.entryIndex) + i.containerState.bitmapCurrShifts = 0 + } + + // Non-zero bitmap uint64, work out next bit set and add together with + // base and current shifts made within this bitmap. + firstBitSet := uint64(bits.TrailingZeros64(i.containerState.bitmapCurr)) + bitmapValue := i.containerState.bitmapCurrBase + + i.containerState.bitmapCurrShifts + + firstBitSet + + // Now shift for the next value. 
+ shifts := firstBitSet + 1 + i.containerState.bitmapCurr = i.containerState.bitmapCurr >> shifts + i.containerState.bitmapCurrShifts += shifts + + i.currValue = i.container.key<<16 | bitmapValue + return true + } + + if i.container.containerType == containerArray { + // Array container. + i.containerState.entryIndex++ + idx := i.containerState.entryIndex + if idx >= len(i.containerState.array) { + // Move to next container. + i.containerExhausted = true + return i.Next() + } + i.currValue = i.container.key<<16 | uint64(i.containerState.array[idx]) + return true + } + + if i.container.containerType == containerRun { + // Run container. + if i.containerState.runsIndex > uint64(i.containerState.runsCurr.last) { + // No more values left in the run, progress to next run. + i.containerState.entryIndex++ + idx := i.containerState.entryIndex + if idx >= len(i.containerState.runs) { + // Move to next container. + i.containerExhausted = true + return i.Next() + } + + i.containerState.runsCurr = i.containerState.runs[i.containerState.entryIndex] + i.containerState.runsIndex = uint64(i.containerState.runsCurr.start) + } + + runValue := i.containerState.runsIndex + i.containerState.runsIndex++ + + i.currValue = i.container.key<<16 | runValue + return true + } + + i.containerExhausted = true + return false +} + +func (i *readOnlyBitmapIterator) Current() postings.ID { + return postings.ID(i.currValue) +} + +func (i *readOnlyBitmapIterator) Err() error { + return nil +} + +func (i *readOnlyBitmapIterator) Close() error { + return nil +} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_readonly_test.go new file mode 100644 index 0000000000..9db96e1307 --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_readonly_test.go @@ -0,0 +1,123 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
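The bitmap container branch of Next above finds successive set bits with
bits.TrailingZeros64 rather than testing bit by bit. A self-contained sketch
of that technique (the setBits helper and example values are illustrative and
not part of the patch):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// setBits returns the positions of the set bits in word, offset by base,
	// by repeatedly locating the lowest set bit with TrailingZeros64 and
	// shifting it away - the same loop structure the iterator uses above.
	func setBits(word uint64, base uint64) []uint64 {
		var (
			out    []uint64
			shifts uint64
		)
		for word != 0 {
			first := uint64(bits.TrailingZeros64(word))
			out = append(out, base+shifts+first)
			word >>= first + 1
			shifts += first + 1
		}
		return out
	}

	func main() {
		// 0b10010 has bits 1 and 4 set; with base 64 this prints [65 68].
		fmt.Println(setBits(0b10010, 64))
	}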
+
+package roaring
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"github.com/m3dbx/pilosa/roaring"
+	"github.com/stretchr/testify/require"
+)
+
+const (
+	readOnlySeed int64 = 123456789
+)
+
+func TestReadOnlyBitmap(t *testing.T) {
+	buff := bytes.NewBuffer(nil)
+
+	rng := rand.New(rand.NewSource(readOnlySeed))
+
+	each := 8
+	tests := []struct {
+		attempts    int
+		insertCount int
+		insertRange int
+	}{
+		// 64 inserts
+		{
+			insertCount: 64,
+			insertRange: 64,
+		},
+		{
+			insertCount: 64,
+			insertRange: 128,
+		},
+		{
+			insertCount: 64,
+			insertRange: 256,
+		},
+		// 4096 inserts
+		{
+			insertCount: 4096,
+			insertRange: 4096,
+		},
+		{
+			insertCount: 4096,
+			insertRange: 8192,
+		},
+		{
+			insertCount: 4096,
+			insertRange: 16384,
+		},
+		// 65536 inserts
+		{
+			insertCount: 65536,
+			insertRange: 65536,
+		},
+		{
+			insertCount: 65536,
+			insertRange: 131072,
+		},
+		{
+			insertCount: 4096,
+			insertRange: 262144,
+		},
+	}
+
+	b := roaring.NewBitmapWithDefaultPooling(2 << 15) // 2^16 containers max, will stay within [0,2^32)
+	for _, test := range tests {
+		for i := 0; i < each; i++ {
+			t.Run(fmt.Sprintf("attempt=%d, test=%+v", i, test), func(t *testing.T) {
+				b.Reset()
+				// Add one to guarantee a non-zero max, which would otherwise
+				// make the modulo below panic.
+				max := uint64(rng.Int63n(int64(test.insertRange))) + 1
+				for j := 0; j < test.insertCount; j++ {
+					value := rng.Uint64() % max
+					b.DirectAdd(value)
+				}
+
+				list := NewPostingsListFromBitmap(b)
+
+				buff.Reset()
+				_, err := b.WriteTo(buff)
+				require.NoError(t, err)
+
+				readOnly, err := NewReadOnlyBitmap(buff.Bytes())
+				require.NoError(t, err)
+
+				// Check for equality.
+				require.True(t, readOnly.Equal(list))
+
+				// Check for contains.
+				iter := list.Iterator()
+				for iter.Next() {
+					curr := iter.Current()
+					require.True(t, readOnly.Contains(curr))
+				}
+				require.NoError(t, iter.Err())
+				require.NoError(t, iter.Close())
+			})
+		}
+	}
+}
diff --git a/src/m3ninx/postings/roaring/roaring_test.go b/src/m3ninx/postings/roaring/roaring_test.go
index 404b1a3a3a..9ada7f122d 100644
--- a/src/m3ninx/postings/roaring/roaring_test.go
+++ b/src/m3ninx/postings/roaring/roaring_test.go
@@ -35,21 +35,6 @@ func TestRoaringPostingsListEmpty(t *testing.T) {
 	require.Equal(t, 0, d.Len())
 }
 
-func TestRoaringPostingsListMax(t *testing.T) {
-	d := NewPostingsList()
-	require.NoError(t, d.Insert(42))
-	require.NoError(t, d.Insert(78))
-	require.NoError(t, d.Insert(103))
-
-	max, err := d.Max()
-	require.NoError(t, err)
-	require.Equal(t, postings.ID(103), max)
-
-	d = NewPostingsList()
-	_, err = d.Max()
-	require.Error(t, err)
-}
-
 func TestRoaringPostingsListInsert(t *testing.T) {
 	d := NewPostingsList()
 	require.NoError(t, d.Insert(1))
diff --git a/src/m3ninx/postings/types.go b/src/m3ninx/postings/types.go
index cc651bdde9..df8d05f88e 100644
--- a/src/m3ninx/postings/types.go
+++ b/src/m3ninx/postings/types.go
@@ -44,7 +44,7 @@ var (
 
 // List is a collection of docIDs. The interface only supports immutable methods.
 type List interface {
-	// Contains returns whether the specified ID is contained in this postings list.
+	// Contains returns whether an ID is contained or not.
 	Contains(id ID) bool
 
 	// IsEmpty returns whether the postings list is empty. Some posting lists have an
@@ -58,9 +58,6 @@ type List interface {
 	// Iterator returns an iterator over the IDs in the postings list.
 	Iterator() Iterator
 
-	// Clone returns a copy of the postings list.
-	Clone() MutableList
-
 	// Equal returns whether this postings list contains the same posting IDs as other.
 	Equal(other List) bool
 }
 
@@ -97,6 +94,9 @@ type MutableList interface {
 	// RemoveRange removes all IDs between [min, max) from this postings list.
 	RemoveRange(min, max ID) error
 
+	// Clone returns a copy of the postings list.
+	Clone() MutableList
+
 	// Reset resets the internal state of the postings list.
 	Reset()
 }
diff --git a/src/x/cache/lru_cache.go b/src/x/cache/lru_cache.go
index 6a20e1b722..b5ef3f9f18 100644
--- a/src/x/cache/lru_cache.go
+++ b/src/x/cache/lru_cache.go
@@ -227,7 +227,7 @@ func (c *LRU) PutWithTTL(key string, value interface{}, ttl time.Duration) {
 	c.mut.Lock()
 	defer c.mut.Unlock()
 
-	_, _ = c.updateCacheEntry(key, expiresAt, value, nil)
+	_, _ = c.updateCacheEntryWithLock(key, expiresAt, value, nil)
 }
 
 // Get returns the value associated with the key, optionally
@@ -349,22 +349,22 @@ func (c *LRU) cacheLoadComplete(
 	defer c.mut.Unlock()
 
 	if err != nil {
-		return c.handleCacheLoadError(key, expiresAt, err)
+		return c.handleCacheLoadErrorWithLock(key, expiresAt, err)
 	}
 
-	return c.updateCacheEntry(key, expiresAt, value, err)
+	return c.updateCacheEntryWithLock(key, expiresAt, value, err)
 }
 
-// handleCacheLoadError handles the results of an error from a cache load. If
+// handleCacheLoadErrorWithLock handles the results of an error from a cache load. If
 // we are caching errors, updates the cache entry with the error. Otherwise
 // removes the cache entry and returns the (possibly unwrapped) error.
-func (c *LRU) handleCacheLoadError(
+func (c *LRU) handleCacheLoadErrorWithLock(
 	key string, expiresAt time.Time, err error,
 ) (interface{}, error) {
 	// If the loader is telling us to cache this error, do so unconditionally
 	var cachedErr CachedError
 	if errors.As(err, &cachedErr) {
-		return c.updateCacheEntry(key, expiresAt, nil, cachedErr.Err)
+		return c.updateCacheEntryWithLock(key, expiresAt, nil, cachedErr.Err)
 	}
 
 	// If the cache is configured to cache errors by default, do so unless
@@ -372,7 +372,7 @@ func (c *LRU) handleCacheLoadError(
 	var uncachedErr UncachedError
 	isUncachedError := errors.As(err, &uncachedErr)
 	if c.cacheErrors && !isUncachedError {
-		return c.updateCacheEntry(key, expiresAt, nil, err)
+		return c.updateCacheEntryWithLock(key, expiresAt, nil, err)
 	}
 
 	// Something happened during load, but we don't want to cache this - remove the entry,
@@ -389,12 +389,17 @@ func (c *LRU) handleCacheLoadError(
 	return nil, err
 }
 
-// updateCacheEntry updates a cache entry with a new value or cached error,
+// updateCacheEntryWithLock updates a cache entry with a new value or cached error,
 // and marks it as the most recently accessed and most recently loaded entry
-func (c *LRU) updateCacheEntry(
+func (c *LRU) updateCacheEntryWithLock(
 	key string, expiresAt time.Time, value interface{}, err error,
 ) (interface{}, error) {
 	entry := c.entries[key]
+	if entry == nil {
+		entry = &lruCacheEntry{key: key}
+		c.entries[key] = entry
+	}
+
 	entry.value, entry.err = value, err
 
 	// Re-adjust expiration and mark as both most recently accessed and most recently loaded
@@ -408,8 +413,10 @@ func (c *LRU) updateCacheEntry(
 	c.metrics.entries.Update(float64(len(c.entries)))
 
 	// Tell any other callers that we're done loading
-	close(entry.loadingCh)
-	entry.loadingCh = nil
+	if entry.loadingCh != nil {
+		close(entry.loadingCh)
+		entry.loadingCh = nil
+	}
 	return value, err
 }
 
diff --git a/src/x/cache/lru_cache_test.go b/src/x/cache/lru_cache_test.go
index d2ef4eb463..82416f5db5 100644
--- a/src/x/cache/lru_cache_test.go
+++ b/src/x/cache/lru_cache_test.go
@@ -570,6 +570,16 @@ func
TestLRU_GetWithTTL_AllowEntrySpecificTTLs(t *testing.T) { assert.Equal(t, 2, loadAttempts) } +func TestLRU_PutWithTTL_NoExistingEntry(t *testing.T) { + lru := NewLRU(nil) + + lru.PutWithTTL("foo", "bar", 0) + + value, err := lru.GetWithTTL(context.Background(), "foo", nil) + require.NoError(t, err) + assert.Equal(t, "bar", value.(string)) +} + var defaultKeys = []string{ "key-0", "key-1", "key-2", "key-3", "key-4", "key-5", "key-6", "key-7", "key-8", "key-9", "key10", } From a0f18015d4166429b090f7242627fcc22d3ef914 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 26 Oct 2020 06:21:00 -0400 Subject: [PATCH 005/106] Implement read only bitmap for close to zero alloc operations on postings lists --- src/dbnode/server/server.go | 1 - .../storage/index/fields_terms_iterator.go | 19 +- .../storage/index/read_through_segment.go | 2 +- src/m3ninx/index/segment/builder/terms.go | 2 +- src/m3ninx/index/segment/fst/options.go | 18 - src/m3ninx/index/segment/fst/segment.go | 39 +- src/m3ninx/index/segment/mem/reader.go | 2 +- src/m3ninx/index/types.go | 2 +- src/m3ninx/postings/compare.go | 83 ++ .../postings/roaring/bitmap_readonly.go | 432 ++++++++++- .../postings/roaring/bitmap_readonly_multi.go | 730 ++++++++++++++++++ src/m3ninx/postings/roaring/roaring.go | 25 +- src/m3ninx/postings/types.go | 10 +- src/m3ninx/search/executor/iterator.go | 13 +- src/m3ninx/search/searcher/all.go | 8 +- src/m3ninx/search/searcher/conjunction.go | 23 +- src/m3ninx/search/searcher/disjunction.go | 20 +- src/m3ninx/search/searcher/empty.go | 9 +- src/m3ninx/search/searcher/field.go | 8 +- .../search/searcher/lazy_postings_list.go | 100 --- src/m3ninx/search/searcher/negation.go | 16 +- src/m3ninx/search/searcher/regexp.go | 8 +- src/m3ninx/search/searcher/term.go | 8 +- src/m3ninx/search/types.go | 2 +- 24 files changed, 1314 insertions(+), 266 deletions(-) create mode 100644 src/m3ninx/postings/compare.go create mode 100644 src/m3ninx/postings/roaring/bitmap_readonly_multi.go delete mode 100644 src/m3ninx/search/searcher/lazy_postings_list.go diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index 152e81eb6d..59d73755b4 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -1672,7 +1672,6 @@ func withEncodingAndPoolingOptions( SetInstrumentOptions(iopts)). SetFSTSegmentOptions( opts.IndexOptions().FSTSegmentOptions(). - SetPostingsListPool(postingsList). SetInstrumentOptions(iopts). SetContextPool(opts.ContextPool())). SetSegmentBuilderOptions( diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index 94bc5cac33..e13d8187ff 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -22,7 +22,6 @@ package index import ( "errors" - "fmt" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" @@ -123,24 +122,13 @@ func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsI return err } - pl, iter, err := searcher.Search(fti.reader) + pl, err := searcher.Search(fti.reader) if err != nil { return err } - if pl == nil && iter != nil { - // Turn iterator into a postings list. - mutable := roaring.NewPostingsList() - if err := mutable.AddIterator(iter); err != nil { - return err - } - pl = mutable - } - if pl == nil { - return fmt.Errorf("no postings list or iterator returned") - } - // Hold onto the postings bitmap to intersect against on a per term basis. 
+	// TODO: This will be a read only bitmap, need to update.
 	bitmap, ok := roaring.BitmapFromPostingsList(pl)
 	if !ok {
 		return errUnpackBitmapFromPostingsList
@@ -221,12 +209,13 @@ func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) {
 		return true, nil
 	}
 
+	// TODO: This will be a read only bitmap, need to update.
 	bitmap, ok := roaring.BitmapFromPostingsList(fti.current.postings)
 	if !ok {
 		return false, errUnpackBitmapFromPostingsList
 	}
 
 	// Check term isn't part of at least some of the documents we're
 	// restricted to providing results for based on intersection
 	// count.
 	// Note: IntersectionCount is significantly faster than intersecting and
diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go
index 52aaaf6251..76665cef34 100644
--- a/src/dbnode/storage/index/read_through_segment.go
+++ b/src/dbnode/storage/index/read_through_segment.go
@@ -253,7 +253,7 @@ func (s *readThroughSegmentReader) MatchField(field []byte) (postings.List, erro
 // NB(r): The postings list returned by match all is just an iterator
 // from zero to the maximum document number indexed by the segment and as such
 // causes no allocations to compute and construct.
-func (s *readThroughSegmentReader) MatchAll() (postings.MutableList, error) {
+func (s *readThroughSegmentReader) MatchAll() (postings.List, error) {
 	return s.reader.MatchAll()
 }
 
diff --git a/src/m3ninx/index/segment/builder/terms.go b/src/m3ninx/index/segment/builder/terms.go
index 4fde080672..0a44c4ac04 100644
--- a/src/m3ninx/index/segment/builder/terms.go
+++ b/src/m3ninx/index/segment/builder/terms.go
@@ -68,7 +68,7 @@ func (t *terms) post(term []byte, id postings.ID) error {
 
 	// If empty posting list, track insertion of this key into the terms
 	// collection for correct response when retrieving all terms
-	newTerm := postingsList.Len() == 0
+	newTerm := postingsList.IsEmpty()
 	if err := postingsList.Insert(id); err != nil {
 		return err
 	}
diff --git a/src/m3ninx/index/segment/fst/options.go b/src/m3ninx/index/segment/fst/options.go
index 275cd75922..bb87bff757 100644
--- a/src/m3ninx/index/segment/fst/options.go
+++ b/src/m3ninx/index/segment/fst/options.go
@@ -22,7 +22,6 @@ package fst
 
 import (
 	"github.com/m3db/m3/src/m3ninx/postings"
-	"github.com/m3db/m3/src/m3ninx/postings/roaring"
 	"github.com/m3db/m3/src/m3ninx/x/bytes"
 	"github.com/m3db/m3/src/x/context"
 	"github.com/m3db/m3/src/x/instrument"
@@ -47,12 +46,6 @@ type Options interface {
 	// BytesPool returns the bytes pool.
 	BytesPool() pool.BytesPool
 
-	// SetPostingsListPool sets the postings list pool.
-	SetPostingsListPool(value postings.Pool) Options
-
-	// PostingsListPool returns the postings list pool.
-	PostingsListPool() postings.Pool
-
 	// SetContextPool sets the contextPool.
 	SetContextPool(value context.Pool) Options
 
@@ -85,7 +78,6 @@ func NewOptions() Options {
 		iopts:             instrument.NewOptions(),
 		bytesSliceArrPool: arrPool,
 		bytesPool:         bytesPool,
-		postingsPool:      postings.NewPool(nil, roaring.NewPostingsList),
 		// Use a zero pool, this should be overridden at config time.
 		contextPool: context.NewPool(context.NewOptions().
 			SetContextPoolOptions(pool.NewObjectPoolOptions().SetSize(0)).
@@ -113,16 +105,6 @@ func (o *opts) BytesPool() pool.BytesPool { return o.bytesPool } -func (o *opts) SetPostingsListPool(v postings.Pool) Options { - opts := *o - opts.postingsPool = v - return &opts -} - -func (o *opts) PostingsListPool() postings.Pool { - return o.postingsPool -} - func (o *opts) SetContextPool(value context.Pool) Options { opts := *o opts.contextPool = value diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index ea0430b937..4f71ad496d 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -418,7 +418,10 @@ func (r *fsSegment) matchFieldNotClosedMaybeFinalizedWithRLock( } if !exists { // i.e. we don't know anything about the term, so can early return an empty postings list - return r.opts.PostingsListPool().Get(), nil + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) } protoBytes, _, err := r.retrieveTermsBytesWithRLock(r.data.FSTTermsData.Bytes, termsFSTOffset) @@ -451,7 +454,10 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( if !exists { // i.e. we don't know anything about the field, so can early return an empty postings list - return r.opts.PostingsListPool().Get(), nil + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) } fstCloser := x.NewSafeCloser(termsFST) @@ -464,7 +470,10 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( if !exists { // i.e. we don't know anything about the term, so can early return an empty postings list - return r.opts.PostingsListPool().Get(), nil + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) } pl, err := r.retrievePostingsListWithRLock(postingsOffset) @@ -501,7 +510,10 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( if !exists { // i.e. we don't know anything about the field, so can early return an empty postings list - return r.opts.PostingsListPool().Get(), nil + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) } var ( @@ -534,7 +546,9 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( iterErr = iter.Next() } - pl, err := roaring.Union(pls) + // NB(r): Can use union read only since we are guaranteed all + // postings lists are read only. + pl, err := roaring.UnionReadOnly(pls) if err != nil { return nil, err } @@ -550,20 +564,17 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( return pl, nil } -func (r *fsSegment) matchAllNotClosedMaybeFinalizedWithRLock() (postings.MutableList, error) { +func (r *fsSegment) matchAllNotClosedMaybeFinalizedWithRLock() (postings.List, error) { // NB(r): Not closed, but could be finalized (i.e. closed segment reader) // calling match field after this segment is finalized. 
if r.finalized { return nil, errReaderFinalized } - pl := r.opts.PostingsListPool().Get() - err := pl.AddRange(0, postings.ID(r.numDocs)) - if err != nil { - return nil, err - } - - return pl, nil + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmapRange(0, uint64(r.numDocs)) } func (r *fsSegment) docNotClosedMaybeFinalizedWithRLock(id postings.ID) (doc.Document, error) { @@ -871,7 +882,7 @@ func (sr *fsSegmentReader) MatchRegexp( return pl, err } -func (sr *fsSegmentReader) MatchAll() (postings.MutableList, error) { +func (sr *fsSegmentReader) MatchAll() (postings.List, error) { if sr.closed { return nil, errReaderClosed } diff --git a/src/m3ninx/index/segment/mem/reader.go b/src/m3ninx/index/segment/mem/reader.go index 385bc24e4a..c029181efb 100644 --- a/src/m3ninx/index/segment/mem/reader.go +++ b/src/m3ninx/index/segment/mem/reader.go @@ -109,7 +109,7 @@ func (r *reader) MatchRegexp(field []byte, compiled index.CompiledRegex) (postin return r.segment.matchRegexp(field, compileRE) } -func (r *reader) MatchAll() (postings.MutableList, error) { +func (r *reader) MatchAll() (postings.List, error) { r.RLock() defer r.RUnlock() if r.closed { diff --git a/src/m3ninx/index/types.go b/src/m3ninx/index/types.go index 99a141fba6..7dec18a368 100644 --- a/src/m3ninx/index/types.go +++ b/src/m3ninx/index/types.go @@ -74,7 +74,7 @@ type Readable interface { MatchRegexp(field []byte, c CompiledRegex) (postings.List, error) // MatchAll returns a postings list for all documents known to the Reader. - MatchAll() (postings.MutableList, error) + MatchAll() (postings.List, error) // Docs returns an iterator over the documents whose IDs are in the provided // postings list. diff --git a/src/m3ninx/postings/compare.go b/src/m3ninx/postings/compare.go new file mode 100644 index 0000000000..ceb9329e60 --- /dev/null +++ b/src/m3ninx/postings/compare.go @@ -0,0 +1,83 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package postings + +// Equal compares two postings lists for equality. 
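+// It first compares cardinalities when both lists can report them cheaply
+// via CountFast, then falls back to pairwise iteration; two lists are equal
+// when their iterators yield identical ID sequences.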
+func Equal(a, b List) bool { + countA, okA := a.CountFast() + countB, okB := b.CountFast() + if okA && okB && countA != countB { + return false + } + + iter := a.Iterator() + otherIter := b.Iterator() + + closed := false + defer func() { + if !closed { + _ = iter.Err() + _ = iter.Close() + _ = otherIter.Err() + _ = otherIter.Close() + } + }() + + for iter.Next() { + if !otherIter.Next() { + return false + } + if iter.Current() != otherIter.Current() { + return false + } + } + + if otherIter.Next() { + // Other iterator still had values. + return false + } + + closed = true + iterErr := iter.Err() + iterClose := iter.Close() + otherIterErr := otherIter.Err() + otherIterClose := otherIter.Close() + return iterErr == nil && + iterClose == nil && + otherIterErr == nil && + otherIterClose == nil +} + +// CountSlow returns the count of postings list values by iterating. +func CountSlow(a List) int { + count := 0 + iter := a.Iterator() + for iter.Next() { + count++ + } + if err := iter.Err(); err != nil { + return 0 + } + if err := iter.Close(); err != nil { + return 0 + } + return count +} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index 50bd077ffb..f3e1989d1a 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -32,14 +32,17 @@ import ( "github.com/m3db/m3/src/m3ninx/postings" ) -var ( - errNotPilosaRoaring = errors.New("not pilosa roaring format") - +const ( headerBaseSize = uint64(8) magicNumber = uint32(12348) storageVersion = uint32(0) bitmapN = 1024 runCountHeaderSize = uint32(2) + containerValues = 2 << 15 // 2^16 or 65k +) + +var ( + errNotPilosaRoaring = errors.New("not pilosa roaring format") ) type containerType byte @@ -61,6 +64,10 @@ var _ postings.List = (*ReadOnlyBitmap)(nil) type ReadOnlyBitmap struct { data []byte keyN uint64 + + rangeOverride bool // if rangeOverride then just a read only range + rangeStartInclusive uint64 + rangeEndExclusive uint64 } // NewReadOnlyBitmap returns a new read only bitmap. @@ -72,7 +79,28 @@ func NewReadOnlyBitmap(data []byte) (*ReadOnlyBitmap, error) { return b, nil } +// NewReadOnlyBitmapRange returns a special read only bitmap that +// represents a range. +func NewReadOnlyBitmapRange( + startInclusive, endExclusive uint64, +) (*ReadOnlyBitmap, error) { + if endExclusive < startInclusive { + return nil, fmt.Errorf("end cannot be before start: start=%d, end=%d", + startInclusive, endExclusive) + } + + return &ReadOnlyBitmap{ + rangeOverride: true, + rangeStartInclusive: startInclusive, + rangeEndExclusive: endExclusive, + }, nil +} + func (b *ReadOnlyBitmap) Reset(data []byte) error { + b.rangeOverride = false + b.rangeStartInclusive = 0 + b.rangeEndExclusive = 0 + if len(data) == 0 { // Reset to nil b.data = nil @@ -209,6 +237,12 @@ func (b *ReadOnlyBitmap) containerAtIndex(index uint64) readOnlyContainer { } func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { + if b.rangeOverride { + // Using range override. 
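+		// For example (illustrative): a bitmap built with
+		// NewReadOnlyBitmapRange(3, 5) contains only IDs 3 and 4, and is
+		// backed by no container data at all.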
+ return uint64(id) >= b.rangeStartInclusive && + uint64(id) < b.rangeEndExclusive + } + value := uint64(id) container, ok := b.container(highbits(value)) if !ok { @@ -227,10 +261,13 @@ func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { } func (b *ReadOnlyBitmap) IsEmpty() bool { - return b.keyN == 0 + if b.rangeOverride { + return b.rangeCount() == 0 + } + return b.count() == 0 } -func (b *ReadOnlyBitmap) Len() int { +func (b *ReadOnlyBitmap) count() int { l := 0 for i := uint64(0); i < b.keyN; i++ { l += int(b.containerAtIndex(i).cardinality) @@ -238,25 +275,42 @@ func (b *ReadOnlyBitmap) Len() int { return l } +func (b *ReadOnlyBitmap) rangeCount() int { + return int(b.rangeEndExclusive - b.rangeStartInclusive) +} + +func (b *ReadOnlyBitmap) CountFast() (int, bool) { + if b.rangeOverride { + return b.rangeCount(), true + } + return b.count(), true +} + +func (b *ReadOnlyBitmap) CountSlow() int { + if b.rangeOverride { + return b.rangeCount() + } + return b.count() +} + func (b *ReadOnlyBitmap) Iterator() postings.Iterator { + if b.rangeOverride { + return postings.NewRangeIterator(postings.ID(b.rangeStartInclusive), + postings.ID(b.rangeEndExclusive)) + } return newReadOnlyBitmapIterator(b) } -func (b *ReadOnlyBitmap) Equal(other postings.List) bool { - if b.Len() != other.Len() { - return false +func (b *ReadOnlyBitmap) containerIterator() containerIterator { + if b.rangeOverride { + return newReadOnlyBitmapRangeContainerIterator(b.rangeStartInclusive, + b.rangeEndExclusive) } - iter := b.Iterator() - otherIter := other.Iterator() - for iter.Next() { - if !otherIter.Next() { - return false - } - if iter.Current() != otherIter.Current() { - return false - } - } - return true + return newReadOnlyBitmapContainerIterator(b) +} + +func (b *ReadOnlyBitmap) Equal(other postings.List) bool { + return postings.Equal(b, other) } func (b *ReadOnlyBitmap) keyAtIndex(index int) uint64 { @@ -430,3 +484,343 @@ func (i *readOnlyBitmapIterator) Err() error { func (i *readOnlyBitmapIterator) Close() error { return nil } + +var _ containerIterator = (*readOnlyBitmapContainerIterator)(nil) + +type readOnlyBitmapContainerIterator struct { + b *ReadOnlyBitmap + containerIndex int + container readOnlyContainer +} + +func newReadOnlyBitmapContainerIterator( + b *ReadOnlyBitmap, +) *readOnlyBitmapContainerIterator { + return &readOnlyBitmapContainerIterator{ + b: b, + containerIndex: -1, + } +} + +func (i *readOnlyBitmapContainerIterator) NextContainer() bool { + i.containerIndex++ + if i.containerIndex >= int(i.b.keyN) { + return false + } + i.container = i.b.containerAtIndex(uint64(i.containerIndex)) + return true +} + +func (i *readOnlyBitmapContainerIterator) ContainerKey() uint64 { + return i.container.key +} + +func (i *readOnlyBitmapContainerIterator) ContainerUnion( + ctx containerOpContext, + target *bitmapContainer, +) { + if bitmap, ok := i.container.bitmap(); ok { + if ctx.siblings == 0 { + // Special case, if no other containers at same key and this is a + // bitmap container then we can just immediately iterate over that + // no copying required. + target.SetReadOnly(bitmap.values) + return + } + + unionBitmapInPlace(target.bitmap, bitmap.values) + return + } + + if array, ok := i.container.array(); ok { + // Blindly set array values. + for _, v := range array.values { + target.bitmap[v>>6] |= (uint64(1) << (v % 64)) + } + return + } + + if runs, ok := i.container.runs(); ok { + // Blindly set run ranges. 
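+		// (Each run is inclusive of .last, hence the +1 below converting to
+		// the half-open [start, end) form that bitmapSetRange expects.)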
+ for i := 0; i < len(runs.values); i++ { + bitmapSetRange(target.bitmap, + uint64(runs.values[i].start), uint64(runs.values[i].last)+1) + } + return + } +} + +func (i *readOnlyBitmapContainerIterator) ContainerIntersect( + ctx containerOpContext, + target *bitmapContainer, +) { + if bitmap, ok := i.container.bitmap(); ok { + if ctx.siblings == 0 { + // Special case, if no other containers at same key and this is a + // bitmap container then we can just immediately iterate over that + // no copying required. + target.SetReadOnly(bitmap.values) + return + } + + intersectBitmapInPlace(target.bitmap, bitmap.values) + return + } + + if array, ok := i.container.array(); ok { + // Set temp bitmap with the array values then intersect. + ctx.tempBitmap.Reset(false) + for _, v := range array.values { + ctx.tempBitmap.bitmap[v>>6] |= (uint64(1) << (v % 64)) + } + + intersectBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) + return + } + + if runs, ok := i.container.runs(); ok { + // Set temp bitmap with the ranges then intersect with temp. + ctx.tempBitmap.Reset(false) + for i := 0; i < len(runs.values); i++ { + bitmapSetRange(ctx.tempBitmap.bitmap, + uint64(runs.values[i].start), uint64(runs.values[i].last)+1) + } + + intersectBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) + return + } +} + +func (i *readOnlyBitmapContainerIterator) ContainerNegate( + ctx containerOpContext, + target *bitmapContainer, +) { + if bitmap, ok := i.container.bitmap(); ok { + differenceBitmapInPlace(target.bitmap, bitmap.values) + return + } + + if array, ok := i.container.array(); ok { + // Set temp bitmap with the array values then intersect. + ctx.tempBitmap.Reset(false) + for _, v := range array.values { + ctx.tempBitmap.bitmap[v>>6] |= (uint64(1) << (v % 64)) + } + + differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) + return + } + + if runs, ok := i.container.runs(); ok { + // Set temp bitmap with the ranges then intersect with temp. + ctx.tempBitmap.Reset(false) + for i := 0; i < len(runs.values); i++ { + bitmapSetRange(ctx.tempBitmap.bitmap, + uint64(runs.values[i].start), uint64(runs.values[i].last)+1) + } + + differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) + return + } +} + +const maxBitmap = 0xFFFFFFFFFFFFFFFF + +// bitmapSetRange sets all bits in [i, j) the same as pilosa's +// bitmapSetRangeIgnoreN. +// pilosa license is included as part of vendor code install. +func bitmapSetRange(bitmap []uint64, i, j uint64) { + x := i >> 6 + y := (j - 1) >> 6 + var X uint64 = maxBitmap << (i % 64) + var Y uint64 = maxBitmap >> (63 - ((j - 1) % 64)) + + if x == y { + bitmap[x] |= (X & Y) + } else { + bitmap[x] |= X + for i := x + 1; i < y; i++ { + bitmap[i] = maxBitmap + } + bitmap[y] |= Y + } +} + +// bitmapContains returns if bitmap includes element the same as pilosa's +// bitmapContains. +// pilosa license is included as part of vendor code install. +func bitmapContains(bitmap []uint64, v uint16) bool { + return (bitmap[v/64] & (1 << uint64(v%64))) != 0 +} + +func unionBitmapInPlace(a, b []uint64) { + // Below is similar to pilosa's unionBitmapInPlace. + // pilosa license is included as part of vendor code install. + // local variables added to prevent BCE checks in loop + // see https://go101.org/article/bounds-check-elimination.html + var ( + ab = a[:bitmapN] + bb = b[:bitmapN] + ) + + // Manually unroll loop to make it a little faster. 
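+	// (Stepping by 4 is safe since bitmapN is 1024, a multiple of 4.)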
+ for i := 0; i < bitmapN; i += 4 { + ab[i] |= bb[i] + ab[i+1] |= bb[i+1] + ab[i+2] |= bb[i+2] + ab[i+3] |= bb[i+3] + } +} + +func intersectBitmapInPlace(a, b []uint64) { + // Below is similar to pilosa's unionBitmapInPlace. + // pilosa license is included as part of vendor code install. + // local variables added to prevent BCE checks in loop + // see https://go101.org/article/bounds-check-elimination.html + var ( + ab = a[:bitmapN] + bb = b[:bitmapN] + ) + + // Manually unroll loop to make it a little faster. + for i := 0; i < bitmapN; i += 4 { + ab[i] &= bb[i] + ab[i+1] &= bb[i+1] + ab[i+2] &= bb[i+2] + ab[i+3] &= bb[i+3] + } +} + +func differenceBitmapInPlace(a, b []uint64) { + // Below is similar to pilosa's unionBitmapInPlace. + // pilosa license is included as part of vendor code install. + // local variables added to prevent BCE checks in loop + // see https://go101.org/article/bounds-check-elimination.html + var ( + ab = a[:bitmapN] + bb = b[:bitmapN] + ) + + // Manually unroll loop to make it a little faster. + for i := 0; i < bitmapN; i += 4 { + ab[i] &= (^bb[i]) + ab[i+1] &= (^bb[i+1]) + ab[i+2] &= (^bb[i+2]) + ab[i+3] &= (^bb[i+3]) + } +} + +var _ containerIterator = (*readOnlyBitmapRangeContainerIterator)(nil) + +type readOnlyBitmapRangeContainerIterator struct { + startInclusive int64 // use int64 so endInclusive can be -1 if need be + endInclusive int64 // use int64 so endInclusive can be -1 if need be + first bool + key int64 +} + +func newReadOnlyBitmapRangeContainerIterator( + startInclusive, endExclusive uint64, +) *readOnlyBitmapRangeContainerIterator { + return &readOnlyBitmapRangeContainerIterator{ + startInclusive: int64(startInclusive), + endInclusive: int64(endExclusive - 1), + } +} + +func (i *readOnlyBitmapRangeContainerIterator) startInKey() bool { + return i.key == i.startInclusive/containerValues +} + +func (i *readOnlyBitmapRangeContainerIterator) endInKey() bool { + return i.key == i.endInclusive/containerValues +} + +func (i *readOnlyBitmapRangeContainerIterator) validKey() bool { + return i.key <= i.endInclusive/containerValues +} + +func (i *readOnlyBitmapRangeContainerIterator) keyStart() uint64 { + return uint64(i.key) << 16 +} + +func (i *readOnlyBitmapRangeContainerIterator) keyEnd() uint64 { + return uint64(i.key+1) << 16 +} + +func (i *readOnlyBitmapRangeContainerIterator) NextContainer() bool { + if !i.first { + i.first = true + i.key = i.startInclusive / containerValues + return i.validKey() + } + + if !i.validKey() { + return false + } + + i.key++ + return i.validKey() +} + +func (i *readOnlyBitmapRangeContainerIterator) ContainerKey() uint64 { + return uint64(i.key) +} + +func (i *readOnlyBitmapRangeContainerIterator) ContainerUnion( + ctx containerOpContext, + target *bitmapContainer, +) { + start := i.keyStart() + if i.startInKey() { + start = uint64(i.startInclusive) + } + + end := i.keyEnd() + if i.endInKey() { + end = uint64(i.endInclusive) + } + + // Set from [start, end+1) to union. + bitmapSetRange(target.bitmap, start, end+1) +} + +func (i *readOnlyBitmapRangeContainerIterator) ContainerIntersect( + ctx containerOpContext, + target *bitmapContainer, +) { + start := i.keyStart() + if i.startInKey() { + start = uint64(i.startInclusive) + } + + end := i.keyEnd() + if i.endInKey() { + end = uint64(i.endInclusive) + } + + // Create temp overlay and intersect with that. 
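+	// (The range cannot be ORed straight into target here: an intersection
+	// must also clear bits outside [start, end], so the range is staged in
+	// the scratch bitmap and ANDed in.)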
+ ctx.tempBitmap.Reset(false) + bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1) + intersectBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) +} + +func (i *readOnlyBitmapRangeContainerIterator) ContainerNegate( + ctx containerOpContext, + target *bitmapContainer, +) { + start := i.keyStart() + if i.startInKey() { + start = uint64(i.startInclusive) + } + + end := i.keyEnd() + if i.endInKey() { + end = uint64(i.endInclusive) + } + + // Create temp overlay and intersect with that. + ctx.tempBitmap.Reset(false) + bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1) + differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) +} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_multi.go b/src/m3ninx/postings/roaring/bitmap_readonly_multi.go new file mode 100644 index 0000000000..84bcd30c37 --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_readonly_multi.go @@ -0,0 +1,730 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package roaring + +import ( + "errors" + "fmt" + "math/bits" + "sync" + + "github.com/m3db/m3/src/m3ninx/postings" +) + +var ( + // ErrNotReadOnlyBitmaps returned from operations that expect read only bitmaps. + ErrNotReadOnlyBitmaps = errors.New("not read only bitmaps") +) + +// UnionReadOnly expects postings lists to all be read only. +func UnionReadOnly(unions []postings.List) (postings.List, error) { + union := make([]multiBitmapIterable, 0, len(unions)) + for _, elem := range unions { + b, ok := elem.(*ReadOnlyBitmap) + if ok { + union = append(union, multiBitmapIterable{bitmap: b}) + continue + } + + mb, ok := elem.(*multiBitmap) + if !ok { + union = append(union, multiBitmapIterable{multiBitmap: mb}) + } + + return nil, ErrNotReadOnlyBitmaps + } + + return newMultiBitmap(multiBitmapOptions{ + op: multiBitmapOpUnion, + union: union, + }) +} + +// IntersectAndNegateReadOnly expects postings lists to all be read only. 
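+// For example (illustrative), a query of the form `a AND NOT b` evaluates
+// as IntersectAndNegateReadOnly([]postings.List{aPl}, []postings.List{bPl}),
+// where aPl and bPl are read only postings lists.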
+func IntersectAndNegateReadOnly( + intersects []postings.List, + negates []postings.List, +) (postings.List, error) { + intersect := make([]multiBitmapIterable, 0, len(intersects)) + for _, elem := range intersects { + b, ok := elem.(*ReadOnlyBitmap) + if ok { + intersect = append(intersect, multiBitmapIterable{bitmap: b}) + continue + } + + mb, ok := elem.(*multiBitmap) + if !ok { + intersect = append(intersect, multiBitmapIterable{multiBitmap: mb}) + } + + return nil, ErrNotReadOnlyBitmaps + } + + negate := make([]multiBitmapIterable, 0, len(negates)) + for _, elem := range negates { + b, ok := elem.(*ReadOnlyBitmap) + if ok { + negate = append(negate, multiBitmapIterable{bitmap: b}) + continue + } + + mb, ok := elem.(*multiBitmap) + if !ok { + negate = append(negate, multiBitmapIterable{multiBitmap: mb}) + } + + return nil, ErrNotReadOnlyBitmaps + } + + return newMultiBitmap(multiBitmapOptions{ + op: multiBitmapOpIntersect, + intersect: intersect, + intersectNegate: negate, + }) +} + +var _ postings.List = (*multiBitmap)(nil) + +type multiBitmapOp uint8 + +const ( + multiBitmapOpUnknown multiBitmapOp = iota + + // Place valid values between unknown and terminator + multiBitmapOpUnion + multiBitmapOpIntersect + + multiBitmapOpInvalidLast +) + +// validateMultiBitmapOp can do fast validation because it's a range check. +func (op multiBitmapOp) validate() error { + // Fast validation + if op > multiBitmapOpUnknown && op < multiBitmapOpInvalidLast { + return nil + } + return fmt.Errorf("invalid multi-iter op: %d", op) +} + +// multiBitmap is a tree like iterator. +type multiBitmap struct { + multiBitmapOptions +} + +// multiBitmapIterable either contains a bitmap or another multi-iter. +type multiBitmapIterable struct { + multiBitmap *multiBitmap + bitmap *ReadOnlyBitmap +} + +func (i multiBitmapIterable) Contains(id postings.ID) bool { + if i.multiBitmap != nil { + return i.multiBitmap.Contains(id) + } + return i.bitmap.Contains(id) +} + +type multiBitmapOptions struct { + op multiBitmapOp + + // union is valid when multiBitmapOpUnion, no other options valid. + union []multiBitmapIterable + + // intersect is valid when multiBitmapOpIntersect used. + intersect []multiBitmapIterable + // intersectNegate is valid when multiBitmapOpIntersect used. + intersectNegate []multiBitmapIterable +} + +func (o multiBitmapOptions) validate() error { + return o.op.validate() +} + +func newMultiBitmap(opts multiBitmapOptions) (*multiBitmap, error) { + if err := opts.validate(); err != nil { + return nil, err + } + return &multiBitmap{multiBitmapOptions: opts}, nil +} + +func (i *multiBitmap) Contains(id postings.ID) bool { + // Note: (Performance) Contains isn't used in the query path + // so not important how fast this implementation is. + switch i.op { // combineOp validated at creation, ignore invalid. + case multiBitmapOpUnion: + for _, b := range i.union { + if b.Contains(id) { + return true + } + } + return false + case multiBitmapOpIntersect: + for _, b := range i.intersect { + if !b.Contains(id) { + return false + } + } + for _, b := range i.intersectNegate { + if b.Contains(id) { + return false + } + } + // Only valid if all intersecting actually matched, + // if zero intersecting then postings does not contain ID. 
+		return len(i.intersect) > 0
+	}
+	return false
+}
+
+func (i *multiBitmap) IsEmpty() bool {
+	iter := i.Iterator()
+	hasAny := iter.Next()
+	_ = iter.Err()
+	_ = iter.Close()
+	return !hasAny
+}
+
+func (i *multiBitmap) CountFast() (int, bool) {
+	// We only know length after iterating.
+	return 0, false
+}
+
+func (i *multiBitmap) CountSlow() int {
+	return postings.CountSlow(i)
+}
+
+func (i *multiBitmap) Iterator() postings.Iterator {
+	return newMultiBitmapIterator(i.multiBitmapOptions)
+}
+
+func (i *multiBitmap) containerIterator() containerIterator {
+	return newMultiBitmapContainersIterator(i.multiBitmapOptions)
+}
+
+func (i *multiBitmap) Equal(other postings.List) bool {
+	return postings.Equal(i, other)
+}
+
+var _ postings.Iterator = (*multiBitmapIterator)(nil)
+
+type multiBitmapIterator struct {
+	multiBitmapOptions
+
+	iters              []containerIteratorAndOp
+	filtered           []containerIteratorAndOp
+	multiContainerIter multiBitmapContainerIterator
+	bitmap             *bitmapContainer
+	bitmapIter         bitmapContainerIterator
+	tempBitmap         *bitmapContainer
+}
+
+type containerIteratorAndOp struct {
+	it containerIterator
+	op multiContainerOp
+}
+
+type multiContainerOp uint8
+
+const (
+	multiContainerOpUnion multiContainerOp = iota
+	multiContainerOpIntersect
+	multiContainerOpNegate
+)
+
+type containerIterator interface {
+	NextContainer() bool
+	ContainerKey() uint64
+	ContainerUnion(ctx containerOpContext, target *bitmapContainer)
+	ContainerIntersect(ctx containerOpContext, target *bitmapContainer)
+	ContainerNegate(ctx containerOpContext, target *bitmapContainer)
+}
+
+type containerOpContext struct {
+	// siblings is how many other container iterators there are operating
+	// on containers at the same container key.
+	siblings int
+	// tempBitmap is useful for temporary scratch operations and allows
+	// for all sub-operations to share it rather than one per underlying
+	// container iterator.
+	tempBitmap *bitmapContainer
+}
+
+func newMultiBitmapIterator(
+	opts multiBitmapOptions,
+) *multiBitmapIterator {
+	var (
+		n     = len(opts.union) + len(opts.intersect) + len(opts.intersectNegate)
+		iters = make([]containerIteratorAndOp, 0, n)
+	)
+	iters = appendContainerItersWithOp(iters, opts.union, multiContainerOpUnion)
+	iters = appendContainerItersWithOp(iters, opts.intersect, multiContainerOpIntersect)
+	iters = appendContainerItersWithOp(iters, opts.intersectNegate, multiContainerOpNegate)
+	i := &multiBitmapIterator{
+		multiBitmapOptions: opts,
+		iters:              iters,
+		bitmap:             getBitmapContainer(),
+		tempBitmap:         getBitmapContainer(),
+	}
+	i.bitmapIter.Reset(0, i.bitmap)
+	return i
+}
+
+func appendContainerItersWithOp(
+	slice []containerIteratorAndOp,
+	iterables []multiBitmapIterable,
+	op multiContainerOp,
+) []containerIteratorAndOp {
+	for _, elem := range iterables {
+		var it containerIterator
+		switch {
+		case elem.multiBitmap != nil:
+			it = elem.multiBitmap.containerIterator()
+
+		case elem.bitmap != nil:
+			it = elem.bitmap.containerIterator()
+		}
+
+		if !it.NextContainer() {
+			continue
+		}
+
+		slice = append(slice, containerIteratorAndOp{
+			it: it,
+			op: op,
+		})
+	}
+	return slice
+}
+
+func (i *multiBitmapIterator) Next() bool {
+	for !i.bitmapIter.Next() {
+		// Reset to next containers.
+		var ok bool
+		i.iters, ok = i.multiContainerIter.resetAndReturnValid(i.iters)
+		if !ok {
+			// Entirely exhausted valid iterators.
+			return false
+		}
+
+		// Combine all current containers into single bitmap.
+		switch i.op { // Op is already validated at creation time.
+		case multiBitmapOpUnion:
+			// Start bitmap as unset.
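+			// (A union only ever ORs bits in, so the scratch container
+			// starts zeroed.)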
+ i.bitmap.Reset(false) + + // All are unions. + unions := i.filter(i.multiContainerIter.containerIters, multiContainerOpUnion) + ctx := containerOpContext{ + siblings: len(unions) - 1, + tempBitmap: i.tempBitmap, + } + for _, iter := range unions { + iter.it.ContainerUnion(ctx, i.bitmap) + } + case multiBitmapOpIntersect: + // Start bitmap as set, guaranteed to have one intersect call. + i.bitmap.Reset(true) + + intersects := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) + negates := i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate) + ctx := containerOpContext{ + siblings: len(intersects) + len(negates) - 1, + tempBitmap: i.tempBitmap, + } + // Perform intersects. + for _, iter := range intersects { + iter.it.ContainerIntersect(ctx, i.bitmap) + } + // Now perform negations. + for _, iter := range negates { + iter.it.ContainerNegate(ctx, i.bitmap) + } + } + + // Reset the bitmap iterator to read from new bitmap with container key. + i.bitmapIter.Reset(i.multiContainerIter.containerKey, i.bitmap) + } + + // Otherwise multi container iterator has next value. + return true +} + +func (i *multiBitmapIterator) filter( + iters []containerIteratorAndOp, + op multiContainerOp, +) []containerIteratorAndOp { + // Reuse filter slice. + if i.filtered == nil { + // Alloc at longest possible slice, which is total iters + // created for the multi bitmap iterator. + i.filtered = make([]containerIteratorAndOp, 0, len(i.iters)) + } + i.filtered = i.filtered[:0] + for _, iter := range iters { + if iter.op == op { + i.filtered = append(i.filtered, iter) + } + } + return i.filtered +} + +func (i *multiBitmapIterator) Current() postings.ID { + return postings.ID(i.bitmapIter.Current()) +} + +func (i *multiBitmapIterator) Err() error { + return nil +} + +func (i *multiBitmapIterator) Close() error { + // Return bitmaps to pool. + putBitmapContainer(i.bitmap) + i.bitmap = nil + putBitmapContainer(i.tempBitmap) + i.tempBitmap = nil + // No longer reference the bitmap from iterator. + i.bitmapIter.Reset(0, nil) + return nil +} + +type multiBitmapContainerIterator struct { + containerIters []containerIteratorAndOp + containerKey uint64 + + hasPrevContainerKey bool +} + +func (i *multiBitmapContainerIterator) resetAndReturnValid( + input []containerIteratorAndOp, +) ([]containerIteratorAndOp, bool) { + // Reset current state. + i.containerIters = i.containerIters[:0] + + var ( + // Track valid and reuse input slice. + valid = input[:0] + nextContainerKey uint64 + ) + for _, iter := range input { + iterContainerKey := iter.it.ContainerKey() + if i.hasPrevContainerKey && iterContainerKey == i.containerKey { + // Consequent iteration, bump to next container as needs to progress. + if !iter.it.NextContainer() { + // Don't include. + continue + } + // Get next container key. + iterContainerKey = iter.it.ContainerKey() + } + + // First iteration, lowest wins, everything always valid. + valid = append(valid, iter) + + if len(i.containerIters) == 0 || iterContainerKey < nextContainerKey { + // First or new lowest. + i.containerIters = append(i.containerIters[:0], iter) + nextContainerKey = iterContainerKey + } else if iterContainerKey == nextContainerKey { + // Enqueue if same. 
+			i.containerIters = append(i.containerIters, iter)
+		}
+	}
+
+	i.containerKey = nextContainerKey
+	i.hasPrevContainerKey = true
+
+	return valid, len(valid) > 0
+}
+
+var _ containerIterator = (*multiBitmapContainersIterator)(nil)
+
+type multiBitmapContainersIterator struct {
+	multiBitmapOptions
+
+	iters              []containerIteratorAndOp
+	multiContainerIter multiBitmapContainerIterator
+	first              bool
+}
+
+func newMultiBitmapContainersIterator(
+	opts multiBitmapOptions,
+) *multiBitmapContainersIterator {
+	var (
+		n     = len(opts.union) + len(opts.intersect) + len(opts.intersectNegate)
+		iters = make([]containerIteratorAndOp, 0, n)
+	)
+	iters = appendContainerItersWithOp(iters, opts.union, multiContainerOpUnion)
+	iters = appendContainerItersWithOp(iters, opts.intersect, multiContainerOpIntersect)
+	iters = appendContainerItersWithOp(iters, opts.intersectNegate, multiContainerOpNegate)
+	return &multiBitmapContainersIterator{
+		multiBitmapOptions: opts,
+		iters:              iters,
+	}
+}
+
+func (i *multiBitmapContainersIterator) NextContainer() bool {
+	if len(i.iters) == 0 {
+		// Exhausted.
+		return false
+	}
+
+	if i.first {
+		// Always have some valid iterators since we wouldn't
+		// have enqueued if not.
+		i.first = false
+		return true
+	}
+
+	var ok bool
+	i.iters, ok = i.multiContainerIter.resetAndReturnValid(i.iters)
+	if !ok {
+		// Exhausted.
+		return false
+	}
+
+	return true
+}
+
+func (i *multiBitmapContainersIterator) ContainerKey() uint64 {
+	return i.multiContainerIter.containerKey
+}
+
+func (i *multiBitmapContainersIterator) ContainerUnion(
+	ctx containerOpContext,
+	target *bitmapContainer,
+) {
+	switch i.op { // Validated at creation
+	case multiBitmapOpUnion:
+		// Can just blindly union into target since also a union.
+		for _, iter := range i.multiContainerIter.containerIters {
+			if iter.op == multiContainerOpUnion {
+				iter.it.ContainerUnion(ctx, target)
+			}
+		}
+	case multiBitmapOpIntersect:
+		// Need to build intermediate and union with target.
+		// Note: Cannot use ctx.tempBitmap here since downstream
+		// may use it when we call iter.it.ContainerFoo(...) so
+		// we use a specific intermediary here.
+		tempBitmap := i.getTempIntersectAndNegate(ctx)
+		defer putBitmapContainer(tempBitmap)
+
+		unionBitmapInPlace(target.bitmap, tempBitmap.bitmap)
+	}
+}
+
+func (i *multiBitmapContainersIterator) ContainerIntersect(
+	ctx containerOpContext,
+	target *bitmapContainer,
+) {
+	switch i.op { // Validated at creation
+	case multiBitmapOpUnion:
+		// Need to build intermediate and intersect with target.
+		// Note: Cannot use ctx.tempBitmap here since downstream
+		// may use it when we call iter.it.ContainerFoo(...) so
+		// we use a specific intermediary here.
+		tempBitmap := i.getTempUnion(ctx)
+		defer putBitmapContainer(tempBitmap)
+
+		intersectBitmapInPlace(target.bitmap, tempBitmap.bitmap)
+	case multiBitmapOpIntersect:
+		// Need to build intermediate and intersect with target.
+		// Note: Cannot use ctx.tempBitmap here since downstream
+		// may use it when we call iter.it.ContainerFoo(...) so
+		// we use a specific intermediary here.
+		tempBitmap := i.getTempIntersectAndNegate(ctx)
+		defer putBitmapContainer(tempBitmap)
+
+		intersectBitmapInPlace(target.bitmap, tempBitmap.bitmap)
+	}
+}
+
+func (i *multiBitmapContainersIterator) ContainerNegate(
+	ctx containerOpContext,
+	target *bitmapContainer,
+) {
+	switch i.op { // Validated at creation
+	case multiBitmapOpUnion:
+		// Need to build intermediate and intersect with target.
+		// Note: Cannot use ctx.tempBitmap here since downstream
+		// may use it when we call iter.it.ContainerFoo(...) so
+		// we use a specific intermediary here.
+		tempBitmap := i.getTempUnion(ctx)
+		defer putBitmapContainer(tempBitmap)
+
+		differenceBitmapInPlace(target.bitmap, tempBitmap.bitmap)
+	case multiBitmapOpIntersect:
+		// Need to build intermediate and intersect with target.
+		// Note: Cannot use ctx.tempBitmap here since downstream
+		// may use it when we call iter.it.ContainerFoo(...) so
+		// we use a specific intermediary here.
+		tempBitmap := i.getTempIntersectAndNegate(ctx)
+		defer putBitmapContainer(tempBitmap)
+
+		differenceBitmapInPlace(target.bitmap, tempBitmap.bitmap)
+	}
+}
+
+func (i *multiBitmapContainersIterator) getTempUnion(
+	ctx containerOpContext,
+) *bitmapContainer {
+	tempBitmap := getBitmapContainer()
+	for _, iter := range i.multiContainerIter.containerIters {
+		if iter.op == multiContainerOpUnion {
+			iter.it.ContainerUnion(ctx, tempBitmap)
+		}
+	}
+	return tempBitmap
+}
+
+func (i *multiBitmapContainersIterator) getTempIntersectAndNegate(
+	ctx containerOpContext,
+) *bitmapContainer {
+	tempBitmap := getBitmapContainer()
+	// Seed with all bits set so the intersections below can narrow it down.
+	tempBitmap.Reset(true)
+	for _, iter := range i.multiContainerIter.containerIters {
+		if iter.op == multiContainerOpIntersect {
+			iter.it.ContainerIntersect(ctx, tempBitmap)
+		}
+	}
+	for _, iter := range i.multiContainerIter.containerIters {
+		if iter.op == multiContainerOpNegate {
+			iter.it.ContainerNegate(ctx, tempBitmap)
+		}
+	}
+	return tempBitmap
+}
+
+// Very small isolated bitmap container pool: in reality, even when looping
+// over many postings lists, as long as each list is fully iterated before
+// progressing to the next, only a handful of containers need to be live at
+// once, and each bitmap is expensive.
+var bitmapContainerPool = sync.Pool{
+	New: func() interface{} {
+		return newBitmapContainer()
+	},
+}
+
+func getBitmapContainer() *bitmapContainer {
+	v := bitmapContainerPool.Get().(*bitmapContainer)
+	v.Reset(false)
+	return v
+}
+
+func putBitmapContainer(v *bitmapContainer) {
+	bitmapContainerPool.Put(v)
+}
+
+type bitmapContainer struct {
+	// allocated is the allocated slice used for intermediate results.
+	allocated []uint64
+	// bitmap is the current bitmap, sometimes used to refer to
+	// an external bitmap instead of the local allocated one.
+	// NB(r): This is so if there's only a single bitmap for union
+	// or intersect operation it doesn't need to copy the origin
+	// bitmap to the intermediate results.
+	bitmap []uint64
+}
+
+func newBitmapContainer() *bitmapContainer {
+	return &bitmapContainer{allocated: make([]uint64, bitmapN)}
+}
+
+func (b *bitmapContainer) Reset(set bool) {
+	if !set {
+		// Make sure "0" is the default value allocated here
+		// so this is compiled into a memset optimization.
+		for i := range b.allocated {
+			b.allocated[i] = 0
+		}
+	} else {
+		// Manually unroll loop to make it a little faster.
+		for i := 0; i < bitmapN; i += 4 {
+			b.allocated[i] = maxBitmap
+			b.allocated[i+1] = maxBitmap
+			b.allocated[i+2] = maxBitmap
+			b.allocated[i+3] = maxBitmap
+		}
+	}
+
+	// Always point bitmap at the local allocated slice.
+	b.bitmap = b.allocated
+}
+
+func (b *bitmapContainer) SetReadOnly(curr []uint64) {
+	// SetReadOnly should be used with care, only for single bitmap
+	// iteration.
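+	// (The slice is aliased, not copied: it must not be mutated and must
+	// not be retained past iteration of this single container.)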
+	b.bitmap = curr
+}
+
+type bitmapContainerIterator struct {
+	containerKey     uint64
+	bitmap           *bitmapContainer
+	bitmapCurr       uint64
+	bitmapCurrBase   uint64
+	bitmapCurrShifts uint64
+	entryIndex       int
+	currValue        uint64
+}
+
+func (i *bitmapContainerIterator) Reset(
+	containerKey uint64,
+	bitmap *bitmapContainer,
+) {
+	*i = bitmapContainerIterator{}
+	i.containerKey = containerKey
+	i.bitmap = bitmap
+	i.entryIndex = -1
+}
+
+func (i *bitmapContainerIterator) Next() bool {
+	// Bitmap container.
+	for i.bitmapCurr == 0 {
+		// All zero bits, progress to next uint64.
+		i.entryIndex++
+		if i.entryIndex >= len(i.bitmap.bitmap) {
+			// Exhausted.
+			return false
+		}
+
+		i.bitmapCurr = i.bitmap.bitmap[i.entryIndex]
+		i.bitmapCurrBase = uint64(64 * i.entryIndex)
+		i.bitmapCurrShifts = 0
+	}
+
+	// Non-zero bitmap uint64, work out next bit set and add together with
+	// base and current shifts made within this bitmap.
+	firstBitSet := uint64(bits.TrailingZeros64(i.bitmapCurr))
+	bitmapValue := i.bitmapCurrBase +
+		i.bitmapCurrShifts +
+		firstBitSet
+
+	// Now shift for the next value.
+	shifts := firstBitSet + 1
+	i.bitmapCurr = i.bitmapCurr >> shifts
+	i.bitmapCurrShifts += shifts
+
+	i.currValue = i.containerKey<<16 | bitmapValue
+	return true
+}
+
+func (i *bitmapContainerIterator) Current() uint64 {
+	return i.currValue
+}
diff --git a/src/m3ninx/postings/roaring/roaring.go b/src/m3ninx/postings/roaring/roaring.go
index 972dcf684d..714bb62729 100644
--- a/src/m3ninx/postings/roaring/roaring.go
+++ b/src/m3ninx/postings/roaring/roaring.go
@@ -101,7 +101,6 @@ func (d *postingsList) Intersect(other postings.List) error {
 	if !ok {
 		return errIntersectRoaringOnly
 	}
-
 	d.bitmap = d.bitmap.Intersect(o.bitmap)
 	return nil
 }
@@ -179,7 +178,11 @@ func (d *postingsList) IsEmpty() bool {
 	return d.bitmap.Count() == 0
 }
 
-func (d *postingsList) Len() int {
+func (d *postingsList) CountFast() (int, bool) {
+	return int(d.bitmap.Count()), true
+}
+
+func (d *postingsList) CountSlow() int {
 	return int(d.bitmap.Count())
 }
 
@@ -200,23 +203,7 @@ func (d *postingsList) Clone() postings.MutableList {
 }
 
 func (d *postingsList) Equal(other postings.List) bool {
-	if d.Len() != other.Len() {
-		return false
-	}
-
-	iter := d.Iterator()
-	otherIter := other.Iterator()
-
-	for iter.Next() {
-		if !otherIter.Next() {
-			return false
-		}
-		if iter.Current() != otherIter.Current() {
-			return false
-		}
-	}
-
-	return true
+	return postings.Equal(d, other)
 }
 
 type roaringIterator struct {
diff --git a/src/m3ninx/postings/types.go b/src/m3ninx/postings/types.go
index df8d05f88e..068db502d0 100644
--- a/src/m3ninx/postings/types.go
+++ b/src/m3ninx/postings/types.go
@@ -52,8 +52,14 @@ type List interface {
 	// calculating the size of the postings list.
 	IsEmpty() bool
 
-	// Len returns the numbers of IDs in the postings list.
-	Len() int
+	// CountFast returns the cardinality of the postings list if it can be
+	// computed cheaply, otherwise it returns false.
+	CountFast() (int, bool)
+
+	// CountSlow should be called when CountFast returns false and a count
+	// is still required; it falls back to iterating over the postings list
+	// and counting the entries seen during that iteration.
+	CountSlow() int
 
 	// Iterator returns an iterator over the IDs in the postings list.
Iterator() Iterator diff --git a/src/m3ninx/search/executor/iterator.go b/src/m3ninx/search/executor/iterator.go index 8289b28c5c..3be5506a09 100644 --- a/src/m3ninx/search/executor/iterator.go +++ b/src/m3ninx/search/executor/iterator.go @@ -21,8 +21,6 @@ package executor import ( - "fmt" - "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/search" @@ -119,17 +117,10 @@ func (it *iterator) nextIter() (doc.Iterator, bool, error) { } reader := it.readers[it.idx] - pl, iter, err := it.searcher.Search(reader) + pl, err := it.searcher.Search(reader) if err != nil { return nil, false, err } - if pl != nil && iter == nil { - iter = pl.Iterator() - } - if iter == nil { - return nil, false, fmt.Errorf("no postings list or iterator returned") - } - - return index.NewIDDocIterator(reader, iter), true, nil + return index.NewIDDocIterator(reader, pl.Iterator()), true, nil } diff --git a/src/m3ninx/search/searcher/all.go b/src/m3ninx/search/searcher/all.go index a670f868f1..43b1b9d128 100644 --- a/src/m3ninx/search/searcher/all.go +++ b/src/m3ninx/search/searcher/all.go @@ -33,10 +33,6 @@ func NewAllSearcher() search.Searcher { return &all{} } -func (s *all) Search(r index.Reader) (postings.List, postings.Iterator, error) { - pl, err := r.MatchAll() - if err != nil { - return nil, nil, err - } - return pl, nil, nil +func (s *all) Search(r index.Reader) (postings.List, error) { + return r.MatchAll() } diff --git a/src/m3ninx/search/searcher/conjunction.go b/src/m3ninx/search/searcher/conjunction.go index c0dda0a42e..ac5345b777 100644 --- a/src/m3ninx/search/searcher/conjunction.go +++ b/src/m3ninx/search/searcher/conjunction.go @@ -25,6 +25,7 @@ import ( "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/search" ) @@ -46,38 +47,36 @@ func NewConjunctionSearcher(searchers, negations search.Searchers) (search.Searc }, nil } -func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { +func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, error) { var ( intersects = make([]postings.List, 0, len(s.searchers)) negations = make([]postings.List, 0, len(s.negations)) ) for _, sr := range s.searchers { - pl, _, err := sr.Search(r) + pl, err := sr.Search(r) if err != nil { - return nil, nil, err + return nil, err } if pl == nil { - return nil, nil, fmt.Errorf("conjunction searchers must resolve postings lists") + return nil, fmt.Errorf("conjunction searchers must resolve postings lists") } intersects = append(intersects, pl) } for _, sr := range s.negations { - pl, _, err := sr.Search(r) + pl, err := sr.Search(r) if err != nil { - return nil, nil, err + return nil, err } if pl == nil { - return nil, nil, fmt.Errorf("conjunction searchers must resolve postings lists") + return nil, fmt.Errorf("conjunction searchers must resolve postings lists") } negations = append(negations, pl) } - iter, err := newIntersectAndNegatePostingsListIter(intersects, negations) - if err != nil { - return nil, nil, err - } - return nil, iter, nil + // Perform a lazy fast intersect and negate. + // TODO: Try and see if returns err, if so fallback to slower method? 
+ return roaring.IntersectAndNegateReadOnly(intersects, negations) } diff --git a/src/m3ninx/search/searcher/disjunction.go b/src/m3ninx/search/searcher/disjunction.go index 264c951744..d539b58dce 100644 --- a/src/m3ninx/search/searcher/disjunction.go +++ b/src/m3ninx/search/searcher/disjunction.go @@ -21,8 +21,6 @@ package searcher import ( - "fmt" - "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -45,26 +43,20 @@ func NewDisjunctionSearcher(searchers search.Searchers) (search.Searcher, error) }, nil } -func (s *disjunctionSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { +func (s *disjunctionSearcher) Search(r index.Reader) (postings.List, error) { var ( union = make([]postings.List, 0, len(s.searchers)) ) for _, sr := range s.searchers { - pl, _, err := sr.Search(r) + pl, err := sr.Search(r) if err != nil { - return nil, nil, err - } - if pl == nil { - return nil, nil, fmt.Errorf("disjunction searchers must resolve postings lists") + return nil, err } union = append(union, pl) } - pl, err := roaring.Union(union) - if err != nil { - return nil, nil, err - } - - return pl, nil, nil + // Perform a lazy fast union. + // TODO: Try and see if returns err, if so fallback to slower method? + return roaring.UnionReadOnly(union) } diff --git a/src/m3ninx/search/searcher/empty.go b/src/m3ninx/search/searcher/empty.go index e6ffb67e6f..edcab8f936 100644 --- a/src/m3ninx/search/searcher/empty.go +++ b/src/m3ninx/search/searcher/empty.go @@ -28,16 +28,13 @@ import ( ) type emptySearcher struct { - postings postings.List } // NewEmptySearcher returns a new searcher which always returns an empty postings list. func NewEmptySearcher() search.Searcher { - return &emptySearcher{ - postings: roaring.NewPostingsList(), - } + return &emptySearcher{} } -func (s *emptySearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { - return s.postings, nil, nil +func (s *emptySearcher) Search(r index.Reader) (postings.List, error) { + return roaring.NewReadOnlyBitmap(nil) } diff --git a/src/m3ninx/search/searcher/field.go b/src/m3ninx/search/searcher/field.go index 3bbe0343d8..20446fcf2d 100644 --- a/src/m3ninx/search/searcher/field.go +++ b/src/m3ninx/search/searcher/field.go @@ -37,10 +37,6 @@ func NewFieldSearcher(field []byte) (search.Searcher, error) { }, nil } -func (s *fieldSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { - pl, err := r.MatchField(s.field) - if err != nil { - return nil, nil, err - } - return pl, nil, nil +func (s *fieldSearcher) Search(r index.Reader) (postings.List, error) { + return r.MatchField(s.field) } diff --git a/src/m3ninx/search/searcher/lazy_postings_list.go b/src/m3ninx/search/searcher/lazy_postings_list.go deleted file mode 100644 index ca22ab3778..0000000000 --- a/src/m3ninx/search/searcher/lazy_postings_list.go +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2020 Uber Technologies, Inc. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package searcher - -import ( - "errors" - "sort" - - "github.com/m3db/m3/src/m3ninx/postings" -) - -var ( - errNoPostingsLists = errors.New("no postings lists") -) - -var _ postings.Iterator = (*intersectAndNegatePostingsListIter)(nil) - -type intersectAndNegatePostingsListIter struct { - smallestIntersectIter postings.Iterator - nonSmallestIntersectsAsc []postings.List - negationsDesc []postings.List - current postings.ID -} - -func newIntersectAndNegatePostingsListIter( - intersects []postings.List, - negations []postings.List, -) (postings.Iterator, error) { - if len(intersects) == 0 { - return nil, errNoPostingsLists - } - - // Always intersect using the smallest at top so it can - // directly compare if intersected with other results from - // other lists. 
- sort.Slice(intersects, func(i, j int) bool { - return intersects[i].Len() < intersects[j].Len() - }) - sort.Slice(negations, func(i, j int) bool { - return negations[i].Len() > negations[j].Len() - }) - return &intersectAndNegatePostingsListIter{ - smallestIntersectIter: intersects[0].Iterator(), - nonSmallestIntersectsAsc: intersects[1:], - negationsDesc: negations, - current: postings.MaxID, - }, nil -} - -func (it *intersectAndNegatePostingsListIter) Current() postings.ID { - return it.current -} - -func (it *intersectAndNegatePostingsListIter) Next() bool { -NextValue: - for { - if !it.smallestIntersectIter.Next() { - return false - } - curr := it.smallestIntersectIter.Current() - for _, list := range it.nonSmallestIntersectsAsc { - if !list.Contains(curr) { - continue NextValue - } - } - for _, list := range it.negationsDesc { - if list.Contains(curr) { - continue NextValue - } - } - it.current = curr - return true - } -} - -func (it *intersectAndNegatePostingsListIter) Err() error { - return nil -} - -func (it *intersectAndNegatePostingsListIter) Close() error { - return nil -} diff --git a/src/m3ninx/search/searcher/negation.go b/src/m3ninx/search/searcher/negation.go index 328d7f6dc5..62c5c191c1 100644 --- a/src/m3ninx/search/searcher/negation.go +++ b/src/m3ninx/search/searcher/negation.go @@ -23,6 +23,7 @@ package searcher import ( "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/search" ) @@ -38,17 +39,20 @@ func NewNegationSearcher(s search.Searcher) (search.Searcher, error) { }, nil } -func (s *negationSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { +func (s *negationSearcher) Search(r index.Reader) (postings.List, error) { pl, err := r.MatchAll() if err != nil { - return nil, nil, err + return nil, err } - sPl, _, err := s.searcher.Search(r) + negatePl, err := s.searcher.Search(r) if err != nil { - return nil, nil, err + return nil, err } - pl.Difference(sPl) - return pl, nil, nil + // Perform a lazy fast intersect and negate. + // TODO: Try and see if returns err, if so fallback to slower method? 
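+	// (Negation is thus expressed as MatchAll intersected with the
+	// complement of the searcher's postings: the segment universe minus
+	// the matches.)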
+ intersects := []postings.List{pl} + negations := []postings.List{negatePl} + return roaring.IntersectAndNegateReadOnly(intersects, negations) } diff --git a/src/m3ninx/search/searcher/regexp.go b/src/m3ninx/search/searcher/regexp.go index 8141f2597b..f4c1688d95 100644 --- a/src/m3ninx/search/searcher/regexp.go +++ b/src/m3ninx/search/searcher/regexp.go @@ -40,10 +40,6 @@ func NewRegexpSearcher(field []byte, compiled index.CompiledRegex) search.Search } } -func (s *regexpSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { - pl, err := r.MatchRegexp(s.field, s.compiled) - if err != nil { - return nil, nil, err - } - return pl, nil, nil +func (s *regexpSearcher) Search(r index.Reader) (postings.List, error) { + return r.MatchRegexp(s.field, s.compiled) } diff --git a/src/m3ninx/search/searcher/term.go b/src/m3ninx/search/searcher/term.go index f9e4da1dbb..b550e1211f 100644 --- a/src/m3ninx/search/searcher/term.go +++ b/src/m3ninx/search/searcher/term.go @@ -38,10 +38,6 @@ func NewTermSearcher(field, term []byte) search.Searcher { } } -func (s *termSearcher) Search(r index.Reader) (postings.List, postings.Iterator, error) { - pl, err := r.MatchTerm(s.field, s.term) - if err != nil { - return nil, nil, err - } - return pl, nil, nil +func (s *termSearcher) Search(r index.Reader) (postings.List, error) { + return r.MatchTerm(s.field, s.term) } diff --git a/src/m3ninx/search/types.go b/src/m3ninx/search/types.go index cf233a25ae..d4fc76b4ab 100644 --- a/src/m3ninx/search/types.go +++ b/src/m3ninx/search/types.go @@ -56,7 +56,7 @@ type Query interface { // of the documents it matches for the given segment. type Searcher interface { // Search executes a configured query against the given Reader. - Search(index.Reader) (postings.List, postings.Iterator, error) + Search(index.Reader) (postings.List, error) } // Searchers is a slice of Searcher. 
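
For reference, a minimal sketch (not part of the patches) of how a caller drives the simplified single-return Search signature above; the index.Reader is assumed to come from an open segment, and the matchTermIDs helper name and its arguments are illustrative only:

	package example

	import (
		"github.com/m3db/m3/src/m3ninx/index"
		"github.com/m3db/m3/src/m3ninx/postings"
		"github.com/m3db/m3/src/m3ninx/search/searcher"
	)

	// matchTermIDs collects the IDs of documents matching field=term.
	func matchTermIDs(r index.Reader, field, term []byte) ([]postings.ID, error) {
		s := searcher.NewTermSearcher(field, term)
		// A single postings.List is returned; compound searchers yield
		// lazily evaluated read only lists.
		pl, err := s.Search(r)
		if err != nil {
			return nil, err
		}
		var ids []postings.ID
		iter := pl.Iterator()
		for iter.Next() {
			ids = append(ids, iter.Current())
		}
		if err := iter.Err(); err != nil {
			return nil, err
		}
		return ids, iter.Close()
	}
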
From 5208869094a607239ca015bd083f42e6ea03408e Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 27 Oct 2020 17:36:56 -0400 Subject: [PATCH 006/106] Add tests --- ...only_multi.go => bitmap_multi_readonly.go} | 98 ++++-- .../roaring/bitmap_multi_readonly_test.go | 289 ++++++++++++++++++ .../postings/roaring/bitmap_readonly.go | 158 ++++++++-- .../postings/roaring/bitmap_readonly_test.go | 3 + src/m3ninx/postings/roaring/roaring_test.go | 154 +++++----- src/m3ninx/search/proptest/concurrent_test.go | 17 +- src/m3ninx/search/searcher/conjunction.go | 8 - 7 files changed, 587 insertions(+), 140 deletions(-) rename src/m3ninx/postings/roaring/{bitmap_readonly_multi.go => bitmap_multi_readonly.go} (91%) create mode 100644 src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_multi.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go similarity index 91% rename from src/m3ninx/postings/roaring/bitmap_readonly_multi.go rename to src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 84bcd30c37..306f2c2f82 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly_multi.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -45,8 +45,9 @@ func UnionReadOnly(unions []postings.List) (postings.List, error) { } mb, ok := elem.(*multiBitmap) - if !ok { + if ok { union = append(union, multiBitmapIterable{multiBitmap: mb}) + continue } return nil, ErrNotReadOnlyBitmaps @@ -72,8 +73,9 @@ func IntersectAndNegateReadOnly( } mb, ok := elem.(*multiBitmap) - if !ok { + if ok { intersect = append(intersect, multiBitmapIterable{multiBitmap: mb}) + continue } return nil, ErrNotReadOnlyBitmaps @@ -88,8 +90,9 @@ func IntersectAndNegateReadOnly( } mb, ok := elem.(*multiBitmap) - if !ok { + if ok { negate = append(negate, multiBitmapIterable{multiBitmap: mb}) + continue } return nil, ErrNotReadOnlyBitmaps @@ -156,7 +159,20 @@ type multiBitmapOptions struct { } func (o multiBitmapOptions) validate() error { - return o.op.validate() + if err := o.op.validate(); err != nil { + return err + } + switch o.op { + case multiBitmapOpUnion: + if n := len(o.union); n == 0 { + return fmt.Errorf("union requires union postings: actual=%d", n) + } + case multiBitmapOpIntersect: + if n := len(o.intersect); n == 0 { + return fmt.Errorf("intersect requires intersect postings: actual=%d", n) + } + } + return nil } func newMultiBitmap(opts multiBitmapOptions) (*multiBitmap, error) { @@ -229,6 +245,7 @@ var _ postings.Iterator = (*multiBitmapIterator)(nil) type multiBitmapIterator struct { multiBitmapOptions + err error iters []containerIteratorAndOp filtered []containerIteratorAndOp multiContainerIter multiBitmapContainerIterator @@ -256,6 +273,8 @@ type containerIterator interface { ContainerUnion(ctx containerOpContext, target *bitmapContainer) ContainerIntersect(ctx containerOpContext, target *bitmapContainer) ContainerNegate(ctx containerOpContext, target *bitmapContainer) + Err() error + Close() } type containerOpContext struct { @@ -316,10 +335,21 @@ func appendContainerItersWithOp( } func (i *multiBitmapIterator) Next() bool { + if i.err != nil { + return false + } + for !i.bitmapIter.Next() { // Reset to next containers. - var ok bool - i.iters, ok = i.multiContainerIter.resetAndReturnValid(i.iters) + var ( + ok bool + err error + ) + i.iters, ok, err = i.multiContainerIter.resetAndReturnValid(i.iters) + if err != nil { + i.err = err + return false + } if !ok { // Entirely exhausted valid iterators. 
return false @@ -344,17 +374,18 @@ func (i *multiBitmapIterator) Next() bool { // Start bitmap as set, guaranteed to have one intersect call. i.bitmap.Reset(true) - intersects := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) - negates := i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate) ctx := containerOpContext{ - siblings: len(intersects) + len(negates) - 1, + siblings: len(i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect)) + + len(i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate)) - 1, tempBitmap: i.tempBitmap, } // Perform intersects. + intersects := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) for _, iter := range intersects { iter.it.ContainerIntersect(ctx, i.bitmap) } // Now perform negations. + negates := i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate) for _, iter := range negates { iter.it.ContainerNegate(ctx, i.bitmap) } @@ -392,10 +423,15 @@ func (i *multiBitmapIterator) Current() postings.ID { } func (i *multiBitmapIterator) Err() error { - return nil + return i.err } func (i *multiBitmapIterator) Close() error { + // Close any iters that are left if we abort early. + for _, iter := range i.iters { + iter.it.Close() + } + // Return bitmaps to pool. putBitmapContainer(i.bitmap) i.bitmap = nil @@ -415,7 +451,7 @@ type multiBitmapContainerIterator struct { func (i *multiBitmapContainerIterator) resetAndReturnValid( input []containerIteratorAndOp, -) ([]containerIteratorAndOp, bool) { +) ([]containerIteratorAndOp, bool, error) { // Reset current state. i.containerIters = i.containerIters[:0] @@ -430,8 +466,14 @@ func (i *multiBitmapContainerIterator) resetAndReturnValid( // Consequent iteration, bump to next container as needs to progress. if !iter.it.NextContainer() { // Don't include. + err := iter.it.Err() + iter.it.Close() // Always close + if err != nil { + return nil, false, err + } continue } + // Get next container key. iterContainerKey = iter.it.ContainerKey() } @@ -452,7 +494,7 @@ func (i *multiBitmapContainerIterator) resetAndReturnValid( i.containerKey = nextContainerKey i.hasPrevContainerKey = true - return valid, len(valid) > 0 + return valid, len(valid) > 0, nil } var _ containerIterator = (*multiBitmapContainersIterator)(nil) @@ -460,6 +502,7 @@ var _ containerIterator = (*multiBitmapContainersIterator)(nil) type multiBitmapContainersIterator struct { multiBitmapOptions + err error iters []containerIteratorAndOp multiContainerIter multiBitmapContainerIterator first bool @@ -481,7 +524,7 @@ func newMultiBitmapContainersIterator( } func (i *multiBitmapContainersIterator) NextContainer() bool { - if len(i.iters) != 0 { + if i.err != nil || len(i.iters) != 0 { // Exhausted. return true } @@ -493,8 +536,15 @@ func (i *multiBitmapContainersIterator) NextContainer() bool { return true } - var ok bool - i.iters, ok = i.multiContainerIter.resetAndReturnValid(i.iters) + var ( + ok bool + err error + ) + i.iters, ok, err = i.multiContainerIter.resetAndReturnValid(i.iters) + if err != nil { + i.err = err + return false + } if !ok { // Exhausted. 
return false @@ -583,6 +633,13 @@ func (i *multiBitmapContainersIterator) ContainerNegate( } } +func (i *multiBitmapContainersIterator) Err() error { + return i.err +} + +func (i *multiBitmapContainersIterator) Close() { +} + func (i *multiBitmapContainersIterator) getTempUnion( ctx containerOpContext, ) *bitmapContainer { @@ -650,17 +707,18 @@ func newBitmapContainer() *bitmapContainer { func (b *bitmapContainer) Reset(set bool) { if !set { // Make sure "0" is the default value allocated here - // so this is compiled into a memset optimization. + // so this is compiled into a memclr optimization. + // https://codereview.appspot.com/137880043 for i := range b.allocated { b.allocated[i] = 0 } } else { // Manually unroll loop to make it a little faster. for i := 0; i < bitmapN; i += 4 { - b.allocated[i] = 1 - b.allocated[i+1] = 1 - b.allocated[i+2] = 1 - b.allocated[i+3] = 1 + b.allocated[i] = maxBitmap + b.allocated[i+1] = maxBitmap + b.allocated[i+2] = maxBitmap + b.allocated[i+3] = maxBitmap } } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go new file mode 100644 index 0000000000..239ad8c4a0 --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go @@ -0,0 +1,289 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package roaring + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "math/rand" + "os" + "path" + "testing" + + "github.com/m3db/m3/src/m3ninx/postings" + + "github.com/m3dbx/pilosa/roaring" + "github.com/stretchr/testify/require" +) + +func TestMultiBitmap(t *testing.T) { + rng := rand.New(rand.NewSource(seed)) + + each := 8 + numRegular := 2 + // numUnion := 2 + // numNegate := 1 + // numNegateUnion := 2 + numUnion := 0 + numNegate := 0 + numNegateUnion := 0 + tests := []struct { + attempts int + insertCount int + insertRange int + }{ + // 64 inserts + { + insertCount: 64, + insertRange: 64, + }, + { + insertCount: 64, + insertRange: 128, + }, + { + insertCount: 64, + insertRange: 256, + }, + // 4096 inserts + { + insertCount: 4096, + insertRange: 4096, + }, + { + insertCount: 4096, + insertRange: 8192, + }, + { + insertCount: 4096, + insertRange: 16384, + }, + // 65536 inserts + { + insertCount: 65536, + insertRange: 65536, + }, + { + insertCount: 65536, + insertRange: 131072, + }, + { + insertCount: 4096, + insertRange: 262144, + }, + } + + // 2^16 containers max, will stay within [0,2^32) + b := roaring.NewBitmapWithDefaultPooling(2 << 15) + for _, test := range tests { + genOpts := genRandBitmapAndReadOnlyBitmapOptions{ + rng: rng, + bitmap: b, + insertRange: test.insertRange, + insertCount: test.insertCount, + } + for i := 0; i < each; i++ { + t.Run(fmt.Sprintf("attempt=%d, test=+%v", i, test), func(t *testing.T) { + allReadOnly, err := NewReadOnlyBitmapRange(0, uint64(test.insertRange+1)) + require.NoError(t, err) + + reg, regReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numRegular, genOpts) + union, unionReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numUnion, genOpts) + negate, negateReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numNegate, genOpts) + negateUnion, negateUnionReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numNegateUnion, genOpts) + + // First create the inner multi-bitmaps. + multiInner := concat(regReadOnly) + + if numUnion > 0 { + innerUnion, err := UnionReadOnly(unionReadOnly) + require.NoError(t, err) + multiInner = append(multiInner, innerUnion) + } + + if numNegate > 0 { + innerNegate, err := IntersectAndNegateReadOnly(lists(allReadOnly), negateReadOnly) + require.NoError(t, err) + multiInner = append(multiInner, innerNegate) + } + + if numNegateUnion > 0 { + innerNegateUnionUnion, err := UnionReadOnly(negateUnionReadOnly) + require.NoError(t, err) + innerNegateUnion, err := IntersectAndNegateReadOnly(lists(allReadOnly), lists(innerNegateUnionUnion)) + require.NoError(t, err) + multiInner = append(multiInner, innerNegateUnion) + } + + // Create top level multi-bitmap. + multi, err := IntersectAndNegateReadOnly(multiInner, nil) + require.NoError(t, err) + + // Perform same operations the old way with postings lists. + bitmap := roaring.NewBitmap() + // Make sure at least some regular postings lists are being + // intersected, otherwise starting with all bitmap won't be + // useful. + require.True(t, len(reg) > 0) + // First set all bits in the range. + bitmap = bitmap.Flip(0, uint64(test.insertRange)) + // Intersect with regular bitmaps now. + for _, pl := range reg { + bitmap = bitmap.Intersect(bitmapFromPostings(t, pl)) + } + // Intersect with union. + if numUnion > 0 { + pl, err := Union(union) + require.NoError(t, err) + bitmap = bitmap.Intersect(bitmapFromPostings(t, pl)) + } + // Intersect with negate. 
+ if numNegate > 0 { + for _, pl := range negate { + bitmap = bitmap.Difference(bitmapFromPostings(t, pl)) + } + } + // Intersect with negate of union. + if numNegateUnion > 0 { + pl, err := Union(negateUnion) + require.NoError(t, err) + bitmap = bitmap.Difference(bitmapFromPostings(t, pl)) + } + transformed := NewPostingsListFromBitmap(bitmap) + + // Check for equality. + equal := postings.Equal(multi, transformed) + if !equal { + msg := fmt.Sprintf("multi-bitmap: %s\nstandard: %s\n", + postingsString(multi), postingsString(transformed)) + if debug := os.Getenv("TEST_DEBUG_DIR"); debug != "" { + e0 := ioutil.WriteFile(path.Join(debug, "actual.json"), []byte(postingsJSON(t, multi)), 0666) + e1 := ioutil.WriteFile(path.Join(debug, "expected.json"), []byte(postingsJSON(t, transformed)), 0666) + require.NoError(t, e0) + require.NoError(t, e1) + msg += fmt.Sprintf("wrote debug: %s\n", debug) + } + require.FailNow(t, msg) + } + + // Check for contains. + // iter := transformed.Iterator() + // for iter.Next() { + // curr := iter.Current() + // require.True(t, multi.Contains(curr)) + // } + // require.NoError(t, iter.Err()) + // require.NoError(t, iter.Close()) + }) + } + } +} + +func bitmapFromPostings(t *testing.T, pl postings.List) *roaring.Bitmap { + b, ok := BitmapFromPostingsList(pl) + require.True(t, ok) + return b +} + +func lists(list ...postings.List) []postings.List { + return list +} + +func concat(lists ...[]postings.List) []postings.List { + var result []postings.List + for _, list := range lists { + result = append(result, list...) + } + return result +} + +func genRandBitmapsAndReadOnlyBitmaps( + t *testing.T, + count int, + opts genRandBitmapAndReadOnlyBitmapOptions, +) ([]postings.List, []postings.List) { + var regular, readOnlys []postings.List + for i := 0; i < count; i++ { + list, readOnly := genRandBitmapAndReadOnlyBitmap(t, opts) + regular = append(regular, list) + readOnlys = append(readOnlys, readOnly) + } + return regular, readOnlys +} + +type genRandBitmapAndReadOnlyBitmapOptions struct { + rng *rand.Rand + bitmap *roaring.Bitmap + insertRange int + insertCount int +} + +func genRandBitmapAndReadOnlyBitmap( + t *testing.T, + opts genRandBitmapAndReadOnlyBitmapOptions, +) (postings.List, *ReadOnlyBitmap) { + opts.bitmap.Reset() + max := uint64(opts.rng.Int63n(int64(opts.insertRange))) + for j := 0; j < opts.insertCount; j++ { + value := opts.rng.Uint64() % max + opts.bitmap.DirectAdd(value) + } + + list := NewPostingsListFromBitmap(opts.bitmap) + + // Note: do not reuse buffer since read only bitmap + // references them. 
+ buff := bytes.NewBuffer(nil) + _, err := opts.bitmap.WriteTo(buff) + require.NoError(t, err) + + readOnly, err := NewReadOnlyBitmap(buff.Bytes()) + require.NoError(t, err) + + return list, readOnly +} + +func postingsString(pl postings.List) string { + var buf bytes.Buffer + iter := pl.Iterator() + for i := 0; iter.Next(); i++ { + if i != 0 { + buf.WriteString(", ") + } + buf.WriteString(fmt.Sprintf("%d", iter.Current())) + } + return "[" + buf.String() + "]" +} + +func postingsJSON(t *testing.T, pl postings.List) string { + var out []uint64 + iter := pl.Iterator() + for i := 0; iter.Next(); i++ { + out = append(out, uint64(iter.Current())) + } + require.NoError(t, iter.Err()) + require.NoError(t, iter.Close()) + data, err := json.MarshalIndent(out, "", " ") + require.NoError(t, err) + return string(data) +} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index f3e1989d1a..40b04f7b1a 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -39,6 +39,7 @@ const ( bitmapN = 1024 runCountHeaderSize = uint32(2) containerValues = 2 << 15 // 2^16 or 65k + maxBitmap = 0xFFFFFFFFFFFFFFFF ) var ( @@ -101,8 +102,8 @@ func (b *ReadOnlyBitmap) Reset(data []byte) error { b.rangeStartInclusive = 0 b.rangeEndExclusive = 0 + // Reset to nil. if len(data) == 0 { - // Reset to nil b.data = nil b.keyN = 0 return nil @@ -126,16 +127,15 @@ func (b *ReadOnlyBitmap) Reset(data []byte) error { } // Read key count in bytes sizeof(cookie):(sizeof(cookie)+sizeof(uint32)). - keyN := uint64(binary.LittleEndian.Uint32(data[4:8])) + b.keyN = uint64(binary.LittleEndian.Uint32(data[4:8])) + b.data = data - minBytesN := headerBaseSize + keyN*12 + keyN*4 - if uint64(len(data)) < minBytesN { - return fmt.Errorf("bitmap too small: need=%d, actual=%d", - minBytesN, len(data)) + // Validate all the containers. 
+	for i := uint64(0); i < b.keyN; i++ {
+		if _, err := b.containerAtIndex(i); err != nil {
+			return err
+		}
 	}
-
-	b.data = data
-	b.keyN = keyN
 	return nil
 }
 
@@ -179,12 +179,46 @@ func (r runReadOnlyContainer) contains(v uint16) bool {
 	return idx < n && v >= r.values[idx].start && v <= r.values[idx].last
 }
 
+func (c readOnlyContainer) validate() error {
+	switch c.containerType {
+	case containerBitmap:
+		need := int(c.offset) + 8*bitmapN // entry uint64 bitmap 8 bytes
+		if len(c.data) < need {
+			return fmt.Errorf("data too small for bitmap: needs=%d, actual=%d",
+				need, len(c.data))
+		}
+		return nil
+	case containerArray:
+		need := int(c.offset) + 2*int(c.cardinality) // entry is uint16 2 bytes
+		if len(c.data) < need {
+			return fmt.Errorf("data too small for array: needs=%d, actual=%d",
+				need, len(c.data))
+		}
+		return nil
+	case containerRun:
+		need := int(c.offset) + int(runCountHeaderSize)
+		if len(c.data) < need {
+			return fmt.Errorf("data too small for runs header: needs=%d, actual=%d",
+				need, len(c.data))
+		}
+		runCount := binary.LittleEndian.Uint16(c.data[c.offset : c.offset+runCountHeaderSize])
+		need = int(c.offset) + int(runCountHeaderSize) + 4*int(runCount) // entry is two uint16s 4 bytes
+		if len(c.data) < need {
+			return fmt.Errorf("data too small for runs values: needs=%d, actual=%d",
+				need, len(c.data))
+		}
+		return nil
+	}
+	return fmt.Errorf("unknown container: %d", c.containerType)
+}
+
 func (c readOnlyContainer) bitmap() (bitmapReadOnlyContainer, bool) {
 	if c.containerType != containerBitmap {
 		return bitmapReadOnlyContainer{}, false
 	}
 	return bitmapReadOnlyContainer{
-		values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN:bitmapN],
+		values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN],
 	}, true
 }
 
@@ -193,7 +227,7 @@ func (c readOnlyContainer) array() (arrayReadOnlyContainer, bool) {
 		return arrayReadOnlyContainer{}, false
 	}
 	return arrayReadOnlyContainer{
-		values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality:c.cardinality],
+		values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality],
 	}, true
 }
 
@@ -203,7 +237,7 @@ func (c readOnlyContainer) runs() (runReadOnlyContainer, bool) {
 	}
 	runCount := binary.LittleEndian.Uint16(c.data[c.offset : c.offset+runCountHeaderSize])
 	return runReadOnlyContainer{
-		values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount:runCount],
+		values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount],
 	}, true
 }
 
@@ -221,19 +255,49 @@ func (b *ReadOnlyBitmap) container(key uint64) (readOnlyContainer, bool) {
 	if !ok {
 		return readOnlyContainer{}, false
 	}
-	return b.containerAtIndex(index), true
-}
-
-func (b *ReadOnlyBitmap) containerAtIndex(index uint64) readOnlyContainer {
-	meta := b.data[headerBaseSize+index*12:]
-	offsets := b.data[headerBaseSize+b.keyN*12+index*4:]
-	return readOnlyContainer{
+	// All offsets validated at construction time, safe to ignore the
+	// error here.
+	// If we had to return an error to Contains(...) and Iterator() then
+	// we wouldn't be able to implement the API contract.
+	// Today we also have this same issue with existing mmap backed roaring
+	// bitmaps from pilosa, so it doesn't reduce or expand our risk exposure. 
+ container, _ := b.containerAtIndex(index) + return container, true +} + +func (b *ReadOnlyBitmap) containerAtIndex(index uint64) (readOnlyContainer, error) { + const ( + metaTypeStart = 8 + metaTypeEnd = 10 + metaCardStart = 10 + metaCardEnd = 12 + offsetStart = 0 + offsetEnd = 4 + ) + metaIdx := headerBaseSize + index*12 + offsetIdx := headerBaseSize + b.keyN*12 + index*4 + size := uint64(len(b.data)) + if size < metaIdx+metaCardEnd { + return readOnlyContainer{}, fmt.Errorf( + "data too small: need=%d, actual=%d", metaIdx+metaCardEnd, size) + } + if size < offsetIdx+offsetEnd { + return readOnlyContainer{}, fmt.Errorf( + "data too small: need=%d, actual=%d", offsetIdx+offsetEnd, size) + } + meta := b.data[metaIdx:] + offsets := b.data[offsetIdx:] + container := readOnlyContainer{ data: b.data, key: b.keyAtIndex(int(index)), - containerType: containerType(binary.LittleEndian.Uint16(meta[8:10])), - cardinality: uint16(binary.LittleEndian.Uint16(meta[10:12])) + 1, - offset: binary.LittleEndian.Uint32(offsets[0:4]), + containerType: containerType(binary.LittleEndian.Uint16(meta[metaTypeStart:metaTypeEnd])), + cardinality: uint16(binary.LittleEndian.Uint16(meta[metaCardStart:metaCardEnd])) + 1, + offset: binary.LittleEndian.Uint32(offsets[offsetStart:offsetEnd]), } + if err := container.validate(); err != nil { + return readOnlyContainer{}, err + } + return container, nil } func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { @@ -270,7 +334,14 @@ func (b *ReadOnlyBitmap) IsEmpty() bool { func (b *ReadOnlyBitmap) count() int { l := 0 for i := uint64(0); i < b.keyN; i++ { - l += int(b.containerAtIndex(i).cardinality) + // All offsets validated at construction time, safe to ignore the + // error here. + // If we had to return an error to Contains(...) and Iterator() then + // we wouldn't be able to implement the API contract. + // Today we also have this same issue with existing mmap backed roaring + // bitmaps from pilosa, so it doesn't reduce or expand our risk exposure. + container, _ := b.containerAtIndex(i) + l += int(container.cardinality) } return l } @@ -336,6 +407,7 @@ var _ postings.Iterator = (*readOnlyBitmapIterator)(nil) type readOnlyBitmapIterator struct { b *ReadOnlyBitmap + err error containerIndex int containerExhausted bool container readOnlyContainer @@ -386,7 +458,7 @@ func (i *readOnlyBitmapIterator) setContainer(c readOnlyContainer) { } func (i *readOnlyBitmapIterator) Next() bool { - if i.containerIndex >= int(i.b.keyN) { + if i.err != nil || i.containerIndex >= int(i.b.keyN) { // Already exhausted. 
 		return false
 	}
 
@@ -397,8 +469,16 @@ func (i *readOnlyBitmapIterator) Next() bool {
 		if i.containerIndex >= int(i.b.keyN) {
 			return false
 		}
+		i.containerExhausted = false
-		i.setContainer(i.b.containerAtIndex(uint64(i.containerIndex)))
+
+		container, err := i.b.containerAtIndex(uint64(i.containerIndex))
+		if err != nil {
+			i.err = err
+			return false
+		}
+
+		i.setContainer(container)
 	}
 
 	if i.container.containerType == containerBitmap {
@@ -489,6 +569,7 @@ var _ containerIterator = (*readOnlyBitmapContainerIterator)(nil)
 
 type readOnlyBitmapContainerIterator struct {
 	b              *ReadOnlyBitmap
+	err            error
 	containerIndex int
 	container      readOnlyContainer
 }
@@ -503,11 +584,22 @@ func newReadOnlyBitmapContainerIterator(
 }
 
 func (i *readOnlyBitmapContainerIterator) NextContainer() bool {
+	if i.err != nil || i.containerIndex >= int(i.b.keyN) {
+		return false
+	}
+
 	i.containerIndex++
 	if i.containerIndex >= int(i.b.keyN) {
 		return false
 	}
-	i.container = i.b.containerAtIndex(uint64(i.containerIndex))
+
+	container, err := i.b.containerAtIndex(uint64(i.containerIndex))
+	if err != nil {
+		i.err = err
+		return false
+	}
+
+	i.container = container
 	return true
 }
@@ -624,7 +716,12 @@ func (i *readOnlyBitmapContainerIterator) ContainerNegate(
 	}
 }
 
-const maxBitmap = 0xFFFFFFFFFFFFFFFF
+func (i *readOnlyBitmapContainerIterator) Err() error {
+	return i.err
+}
+
+func (i *readOnlyBitmapContainerIterator) Close() {
+}
 
 // bitmapSetRange sets all bits in [i, j) the same as pilosa's
 // bitmapSetRangeIgnoreN.
@@ -824,3 +921,10 @@ func (i *readOnlyBitmapRangeContainerIterator) ContainerNegate(
 	bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1)
 	differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap)
 }
+
+func (i *readOnlyBitmapRangeContainerIterator) Err() error {
+	return nil
+}
+
+func (i *readOnlyBitmapRangeContainerIterator) Close() {
+}
diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_readonly_test.go
index 9db96e1307..4b86870e6b 100644
--- a/src/m3ninx/postings/roaring/bitmap_readonly_test.go
+++ b/src/m3ninx/postings/roaring/bitmap_readonly_test.go
@@ -99,6 +99,9 @@ func TestReadOnlyBitmap(t *testing.T) {
 
 	list := NewPostingsListFromBitmap(b)
 
+	// Note: Do not reuse the buffer until done with the
+	// read only bitmap that is backed by the bytes from
+	// the buffer.
 	buff.Reset()
 	_, err := b.WriteTo(buff)
 	require.NoError(t, err)
diff --git a/src/m3ninx/postings/roaring/roaring_test.go b/src/m3ninx/postings/roaring/roaring_test.go
index 9ada7f122d..8380253dd6 100644
--- a/src/m3ninx/postings/roaring/roaring_test.go
+++ b/src/m3ninx/postings/roaring/roaring_test.go
@@ -32,17 +32,17 @@ import (
 func TestRoaringPostingsListEmpty(t *testing.T) {
 	d := NewPostingsList()
 	require.True(t, d.IsEmpty())
-	require.Equal(t, 0, d.Len())
+	require.Equal(t, 0, d.CountSlow())
 }
 
 func TestRoaringPostingsListInsert(t *testing.T) {
 	d := NewPostingsList()
 	require.NoError(t, d.Insert(1))
 	require.True(t, d.Contains(1))
-	require.Equal(t, 1, d.Len())
+	require.Equal(t, 1, d.CountSlow())
 	// Idempotency of inserts. 
require.NoError(t, d.Insert(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) require.True(t, d.Contains(1)) } @@ -50,25 +50,25 @@ func TestRoaringPostingsListClone(t *testing.T) { d := NewPostingsList() require.NoError(t, d.Insert(1)) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) c := d.Clone() require.True(t, c.Contains(1)) - require.Equal(t, 1, c.Len()) + require.Equal(t, 1, c.CountSlow()) // Ensure only clone is uniquely backed. require.NoError(t, c.Insert(2)) require.True(t, c.Contains(2)) - require.Equal(t, 2, c.Len()) + require.Equal(t, 2, c.CountSlow()) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) } func TestRoaringPostingsListIntersect(t *testing.T) { d := NewPostingsList() require.NoError(t, d.Insert(1)) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) c := d.Clone() require.True(t, c.Contains(1)) @@ -78,17 +78,17 @@ func TestRoaringPostingsListIntersect(t *testing.T) { require.NoError(t, d.Intersect(c)) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) require.True(t, c.Contains(1)) require.True(t, c.Contains(3)) - require.Equal(t, 2, c.Len()) + require.Equal(t, 2, c.CountSlow()) } func TestRoaringPostingsListDifference(t *testing.T) { d := NewPostingsList() require.NoError(t, d.Insert(1)) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) c := d.Clone() require.True(t, c.Contains(1)) @@ -99,8 +99,8 @@ func TestRoaringPostingsListDifference(t *testing.T) { require.False(t, d.Contains(1)) require.True(t, c.Contains(1)) - require.Equal(t, 2, d.Len()) - require.Equal(t, 1, c.Len()) + require.Equal(t, 2, d.CountSlow()) + require.Equal(t, 1, c.CountSlow()) require.True(t, d.Contains(3)) require.True(t, d.Contains(2)) } @@ -109,7 +109,7 @@ func TestRoaringPostingsListUnion(t *testing.T) { d := NewPostingsList() require.NoError(t, d.Insert(1)) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) c := d.Clone() require.True(t, c.Contains(1)) @@ -120,10 +120,10 @@ func TestRoaringPostingsListUnion(t *testing.T) { require.True(t, d.Contains(1)) require.True(t, d.Contains(2)) require.True(t, d.Contains(3)) - require.Equal(t, 3, d.Len()) + require.Equal(t, 3, d.CountSlow()) require.True(t, c.Contains(1)) require.True(t, c.Contains(3)) - require.Equal(t, 2, c.Len()) + require.Equal(t, 2, c.CountSlow()) } func TestRoaringPostingsListAddRange(t *testing.T) { @@ -132,7 +132,7 @@ func TestRoaringPostingsListAddRange(t *testing.T) { require.NoError(t, d.Insert(9)) require.NoError(t, d.AddRange(3, 5)) - require.Equal(t, 4, d.Len()) + require.Equal(t, 4, d.CountSlow()) require.True(t, d.Contains(1)) require.False(t, d.Contains(2)) require.True(t, d.Contains(3)) @@ -150,7 +150,7 @@ func TestRoaringPostingsListRemoveRange(t *testing.T) { require.NoError(t, d.Insert(9)) require.NoError(t, d.RemoveRange(2, 8)) - require.Equal(t, 3, d.Len()) + require.Equal(t, 3, d.CountSlow()) require.True(t, d.Contains(1)) require.False(t, d.Contains(2)) require.False(t, d.Contains(7)) @@ -162,17 +162,17 @@ func TestRoaringPostingsListReset(t *testing.T) { d := NewPostingsList() require.NoError(t, d.Insert(1)) require.True(t, d.Contains(1)) - require.Equal(t, 1, d.Len()) + require.Equal(t, 1, d.CountSlow()) d.Reset() require.True(t, d.IsEmpty()) - require.Equal(t, 0, d.Len()) + require.Equal(t, 0, 
d.CountSlow()) } func TestRoaringPostingsListIter(t *testing.T) { d := NewPostingsList() require.NoError(t, d.Insert(1)) require.NoError(t, d.Insert(2)) - require.Equal(t, 2, d.Len()) + require.Equal(t, 2, d.CountSlow()) it := d.Iterator() defer it.Close() @@ -218,61 +218,61 @@ func TestRoaringPostingsListNotEqualWithOtherRoaring(t *testing.T) { require.False(t, first.Equal(second)) } -func TestRoaringPostingsListEqualWithOtherNonRoaring(t *testing.T) { - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - - first := NewPostingsList() - require.NoError(t, first.Insert(42)) - require.NoError(t, first.Insert(44)) - require.NoError(t, first.Insert(51)) - - postingsIter := postings.NewMockIterator(mockCtrl) - gomock.InOrder( - postingsIter.EXPECT().Next().Return(true), - postingsIter.EXPECT().Current().Return(postings.ID(42)), - postingsIter.EXPECT().Next().Return(true), - postingsIter.EXPECT().Current().Return(postings.ID(44)), - postingsIter.EXPECT().Next().Return(true), - postingsIter.EXPECT().Current().Return(postings.ID(51)), - ) - - second := postings.NewMockList(mockCtrl) - gomock.InOrder( - second.EXPECT().Len().Return(3), - second.EXPECT().Iterator().Return(postingsIter), - ) - - require.True(t, first.Equal(second)) -} - -func TestRoaringPostingsListNotEqualWithOtherNonRoaring(t *testing.T) { - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - - first := NewPostingsList() - require.NoError(t, first.Insert(42)) - require.NoError(t, first.Insert(44)) - require.NoError(t, first.Insert(51)) - - postingsIter := postings.NewMockIterator(mockCtrl) - gomock.InOrder( - postingsIter.EXPECT().Next().Return(true), - postingsIter.EXPECT().Current().Return(postings.ID(42)), - postingsIter.EXPECT().Next().Return(true), - postingsIter.EXPECT().Current().Return(postings.ID(44)), - postingsIter.EXPECT().Next().Return(true), - postingsIter.EXPECT().Current().Return(postings.ID(53)), - ) - - second := postings.NewMockList(mockCtrl) - gomock.InOrder( - second.EXPECT().Len().Return(3), - second.EXPECT().Iterator().Return(postingsIter), - ) - - require.False(t, first.Equal(second)) -} +// func TestRoaringPostingsListEqualWithOtherNonRoaring(t *testing.T) { +// mockCtrl := gomock.NewController(t) +// defer mockCtrl.Finish() + +// first := NewPostingsList() +// require.NoError(t, first.Insert(42)) +// require.NoError(t, first.Insert(44)) +// require.NoError(t, first.Insert(51)) + +// postingsIter := postings.NewMockIterator(mockCtrl) +// gomock.InOrder( +// postingsIter.EXPECT().Next().Return(true), +// postingsIter.EXPECT().Current().Return(postings.ID(42)), +// postingsIter.EXPECT().Next().Return(true), +// postingsIter.EXPECT().Current().Return(postings.ID(44)), +// postingsIter.EXPECT().Next().Return(true), +// postingsIter.EXPECT().Current().Return(postings.ID(51)), +// ) + +// second := postings.NewMockList(mockCtrl) +// gomock.InOrder( +// second.EXPECT().CountSlow().Return(3), +// second.EXPECT().Iterator().Return(postingsIter), +// ) + +// require.True(t, first.Equal(second)) +// } + +// func TestRoaringPostingsListNotEqualWithOtherNonRoaring(t *testing.T) { +// mockCtrl := gomock.NewController(t) +// defer mockCtrl.Finish() + +// first := NewPostingsList() +// require.NoError(t, first.Insert(42)) +// require.NoError(t, first.Insert(44)) +// require.NoError(t, first.Insert(51)) + +// postingsIter := postings.NewMockIterator(mockCtrl) +// gomock.InOrder( +// postingsIter.EXPECT().Next().Return(true), +// postingsIter.EXPECT().Current().Return(postings.ID(42)), +// 
postingsIter.EXPECT().Next().Return(true), +// postingsIter.EXPECT().Current().Return(postings.ID(44)), +// postingsIter.EXPECT().Next().Return(true), +// postingsIter.EXPECT().Current().Return(postings.ID(53)), +// ) + +// second := postings.NewMockList(mockCtrl) +// gomock.InOrder( +// second.EXPECT().CountSlow().Return(3), +// second.EXPECT().Iterator().Return(postingsIter), +// ) + +// require.False(t, first.Equal(second)) +// } func TestRoaringPostingsAddIterator(t *testing.T) { mockCtrl := gomock.NewController(t) @@ -294,7 +294,7 @@ func TestRoaringPostingsAddIterator(t *testing.T) { ) require.NoError(t, first.AddIterator(postingsIter)) - require.Equal(t, 3, first.Len()) + require.Equal(t, 3, first.CountSlow()) require.True(t, first.Contains(postings.ID(42))) require.True(t, first.Contains(postings.ID(44))) require.True(t, first.Contains(postings.ID(51))) diff --git a/src/m3ninx/search/proptest/concurrent_test.go b/src/m3ninx/search/proptest/concurrent_test.go index 9d728ab3a1..f1cd5a413f 100644 --- a/src/m3ninx/search/proptest/concurrent_test.go +++ b/src/m3ninx/search/proptest/concurrent_test.go @@ -21,11 +21,11 @@ package proptest import ( + "fmt" "math/rand" "os" "sync" "testing" - "time" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment/fst" @@ -39,16 +39,17 @@ import ( func TestConcurrentQueries(t *testing.T) { parameters := gopter.DefaultTestParameters() - seed := time.Now().UnixNano() - parameters.MinSuccessfulTests = 100 + // seed := time.Now().UnixNano() + seed := int64(1603711252461848000) + parameters.MinSuccessfulTests = 1000 parameters.MaxSize = 20 parameters.Rng = rand.New(rand.NewSource(seed)) properties := gopter.NewProperties(parameters) simpleSeg := newTestMemSegment(t, lotsTestDocuments) - simpleReader, err := simpleSeg.Reader() - require.NoError(t, err) - simpleExec := executor.NewExecutor([]index.Reader{simpleReader}) + // simpleReader, err := simpleSeg.Reader() + // require.NoError(t, err) + // simpleExec := executor.NewExecutor([]index.Reader{simpleReader}) fstSeg := fst.ToTestSegment(t, simpleSeg, fstOptions) fstReader, err := fstSeg.Reader() @@ -57,10 +58,10 @@ func TestConcurrentQueries(t *testing.T) { properties.Property("Any concurrent queries segments does not affect fst segments", prop.ForAll( func(q search.Query) (bool, error) { - dOrg, err := simpleExec.Execute(q) + dOrg, err := fstExec.Execute(q) require.NoError(t, err) matchedDocs, err := collectDocs(dOrg) - require.NoError(t, err) + require.NoError(t, err, fmt.Sprintf("query: %v\n", q.String())) docMatcher, err := newDocumentIteratorMatcher(matchedDocs...) 
require.NoError(t, err) diff --git a/src/m3ninx/search/searcher/conjunction.go b/src/m3ninx/search/searcher/conjunction.go index ac5345b777..9719900a4c 100644 --- a/src/m3ninx/search/searcher/conjunction.go +++ b/src/m3ninx/search/searcher/conjunction.go @@ -21,8 +21,6 @@ package searcher import ( - "fmt" - "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -57,9 +55,6 @@ func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, error) { if err != nil { return nil, err } - if pl == nil { - return nil, fmt.Errorf("conjunction searchers must resolve postings lists") - } intersects = append(intersects, pl) } @@ -69,9 +64,6 @@ func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, error) { if err != nil { return nil, err } - if pl == nil { - return nil, fmt.Errorf("conjunction searchers must resolve postings lists") - } negations = append(negations, pl) } From a76c6c4df078203fb3c5163ce48a80a94bab0a2b Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 28 Oct 2020 01:57:09 -0400 Subject: [PATCH 007/106] Fix tests --- src/m3ninx/postings/compare.go | 3 +- ...only_multi.go => bitmap_multi_readonly.go} | 107 +++++-- .../roaring/bitmap_multi_readonly_test.go | 288 ++++++++++++++++++ .../postings/roaring/bitmap_readonly.go | 169 ++++++++-- .../postings/roaring/bitmap_readonly_test.go | 3 + 5 files changed, 521 insertions(+), 49 deletions(-) rename src/m3ninx/postings/roaring/{bitmap_readonly_multi.go => bitmap_multi_readonly.go} (90%) create mode 100644 src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go diff --git a/src/m3ninx/postings/compare.go b/src/m3ninx/postings/compare.go index ceb9329e60..561bc3f773 100644 --- a/src/m3ninx/postings/compare.go +++ b/src/m3ninx/postings/compare.go @@ -45,7 +45,8 @@ func Equal(a, b List) bool { if !otherIter.Next() { return false } - if iter.Current() != otherIter.Current() { + curr, otherCurr := iter.Current(), otherIter.Current() + if curr != otherCurr { return false } } diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_multi.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go similarity index 90% rename from src/m3ninx/postings/roaring/bitmap_readonly_multi.go rename to src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 84bcd30c37..85ed8544c4 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly_multi.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -45,8 +45,9 @@ func UnionReadOnly(unions []postings.List) (postings.List, error) { } mb, ok := elem.(*multiBitmap) - if !ok { + if ok { union = append(union, multiBitmapIterable{multiBitmap: mb}) + continue } return nil, ErrNotReadOnlyBitmaps @@ -72,8 +73,9 @@ func IntersectAndNegateReadOnly( } mb, ok := elem.(*multiBitmap) - if !ok { + if ok { intersect = append(intersect, multiBitmapIterable{multiBitmap: mb}) + continue } return nil, ErrNotReadOnlyBitmaps @@ -88,8 +90,9 @@ func IntersectAndNegateReadOnly( } mb, ok := elem.(*multiBitmap) - if !ok { + if ok { negate = append(negate, multiBitmapIterable{multiBitmap: mb}) + continue } return nil, ErrNotReadOnlyBitmaps @@ -156,7 +159,10 @@ type multiBitmapOptions struct { } func (o multiBitmapOptions) validate() error { - return o.op.validate() + if err := o.op.validate(); err != nil { + return err + } + return nil } func newMultiBitmap(opts multiBitmapOptions) (*multiBitmap, error) { @@ -229,6 +235,8 @@ var _ postings.Iterator = (*multiBitmapIterator)(nil) type multiBitmapIterator struct { multiBitmapOptions + 
err error + initial []containerIteratorAndOp iters []containerIteratorAndOp filtered []containerIteratorAndOp multiContainerIter multiBitmapContainerIterator @@ -256,6 +264,8 @@ type containerIterator interface { ContainerUnion(ctx containerOpContext, target *bitmapContainer) ContainerIntersect(ctx containerOpContext, target *bitmapContainer) ContainerNegate(ctx containerOpContext, target *bitmapContainer) + Err() error + Close() } type containerOpContext struct { @@ -280,6 +290,7 @@ func newMultiBitmapIterator( iters = appendContainerItersWithOp(iters, opts.intersectNegate, multiContainerOpNegate) i := &multiBitmapIterator{ multiBitmapOptions: opts, + initial: iters, iters: iters, bitmap: getBitmapContainer(), tempBitmap: getBitmapContainer(), @@ -316,10 +327,21 @@ func appendContainerItersWithOp( } func (i *multiBitmapIterator) Next() bool { + if i.err != nil { + return false + } + for !i.bitmapIter.Next() { // Reset to next containers. - var ok bool - i.iters, ok = i.multiContainerIter.resetAndReturnValid(i.iters) + var ( + ok bool + err error + ) + i.iters, ok, err = i.multiContainerIter.resetAndReturnValid(i.iters) + if err != nil { + i.err = err + return false + } if !ok { // Entirely exhausted valid iterators. return false @@ -341,20 +363,36 @@ func (i *multiBitmapIterator) Next() bool { iter.it.ContainerUnion(ctx, i.bitmap) } case multiBitmapOpIntersect: + totalIntersect := len(i.filter(i.initial, multiContainerOpIntersect)) + currIntersect := len(i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect)) + + // NB(r): Only intersect if all iterators have a container, otherwise + // there is zero overlap and so intersecting always results in + // no results for this container. + if totalIntersect != currIntersect { + continue + } + + if currIntersect == 0 { + // No intersections so only possible negations of nothing. + continue + } + // Start bitmap as set, guaranteed to have one intersect call. i.bitmap.Reset(true) - intersects := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) - negates := i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate) + currNegate := len(i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate)) ctx := containerOpContext{ - siblings: len(intersects) + len(negates) - 1, + siblings: currIntersect + currNegate - 1, tempBitmap: i.tempBitmap, } // Perform intersects. + intersects := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) for _, iter := range intersects { iter.it.ContainerIntersect(ctx, i.bitmap) } // Now perform negations. + negates := i.filter(i.multiContainerIter.containerIters, multiContainerOpNegate) for _, iter := range negates { iter.it.ContainerNegate(ctx, i.bitmap) } @@ -392,10 +430,15 @@ func (i *multiBitmapIterator) Current() postings.ID { } func (i *multiBitmapIterator) Err() error { - return nil + return i.err } func (i *multiBitmapIterator) Close() error { + // Close any iters that are left if we abort early. + for _, iter := range i.iters { + iter.it.Close() + } + // Return bitmaps to pool. putBitmapContainer(i.bitmap) i.bitmap = nil @@ -415,7 +458,7 @@ type multiBitmapContainerIterator struct { func (i *multiBitmapContainerIterator) resetAndReturnValid( input []containerIteratorAndOp, -) ([]containerIteratorAndOp, bool) { +) ([]containerIteratorAndOp, bool, error) { // Reset current state. 
i.containerIters = i.containerIters[:0] @@ -429,9 +472,15 @@ func (i *multiBitmapContainerIterator) resetAndReturnValid( if i.hasPrevContainerKey && iterContainerKey == i.containerKey { // Consequent iteration, bump to next container as needs to progress. if !iter.it.NextContainer() { - // Don't include. + // Don't include, exhausted. + err := iter.it.Err() + iter.it.Close() // Always close + if err != nil { + return nil, false, err + } continue } + // Get next container key. iterContainerKey = iter.it.ContainerKey() } @@ -452,7 +501,7 @@ func (i *multiBitmapContainerIterator) resetAndReturnValid( i.containerKey = nextContainerKey i.hasPrevContainerKey = true - return valid, len(valid) > 0 + return valid, len(valid) > 0, nil } var _ containerIterator = (*multiBitmapContainersIterator)(nil) @@ -460,6 +509,7 @@ var _ containerIterator = (*multiBitmapContainersIterator)(nil) type multiBitmapContainersIterator struct { multiBitmapOptions + err error iters []containerIteratorAndOp multiContainerIter multiBitmapContainerIterator first bool @@ -481,7 +531,7 @@ func newMultiBitmapContainersIterator( } func (i *multiBitmapContainersIterator) NextContainer() bool { - if len(i.iters) != 0 { + if i.err != nil || len(i.iters) != 0 { // Exhausted. return true } @@ -493,8 +543,15 @@ func (i *multiBitmapContainersIterator) NextContainer() bool { return true } - var ok bool - i.iters, ok = i.multiContainerIter.resetAndReturnValid(i.iters) + var ( + ok bool + err error + ) + i.iters, ok, err = i.multiContainerIter.resetAndReturnValid(i.iters) + if err != nil { + i.err = err + return false + } if !ok { // Exhausted. return false @@ -583,6 +640,13 @@ func (i *multiBitmapContainersIterator) ContainerNegate( } } +func (i *multiBitmapContainersIterator) Err() error { + return i.err +} + +func (i *multiBitmapContainersIterator) Close() { +} + func (i *multiBitmapContainersIterator) getTempUnion( ctx containerOpContext, ) *bitmapContainer { @@ -650,17 +714,18 @@ func newBitmapContainer() *bitmapContainer { func (b *bitmapContainer) Reset(set bool) { if !set { // Make sure "0" is the default value allocated here - // so this is compiled into a memset optimization. + // so this is compiled into a memclr optimization. + // https://codereview.appspot.com/137880043 for i := range b.allocated { b.allocated[i] = 0 } } else { // Manually unroll loop to make it a little faster. for i := 0; i < bitmapN; i += 4 { - b.allocated[i] = 1 - b.allocated[i+1] = 1 - b.allocated[i+2] = 1 - b.allocated[i+3] = 1 + b.allocated[i] = maxBitmap + b.allocated[i+1] = maxBitmap + b.allocated[i+2] = maxBitmap + b.allocated[i+3] = maxBitmap } } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go new file mode 100644 index 0000000000..d23753bd1a --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go @@ -0,0 +1,288 @@ +// Copyright (c) 2020 Uber Technologies, Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package roaring + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "math/rand" + "os" + "path" + "testing" + + "github.com/m3db/m3/src/m3ninx/postings" + + "github.com/m3dbx/pilosa/roaring" + "github.com/stretchr/testify/require" +) + +func TestMultiBitmap(t *testing.T) { + rng := rand.New(rand.NewSource(seed)) + + each := 8 + numRegular := 2 + // numUnion := 2 + // numNegate := 1 + // numNegateUnion := 2 + numUnion := 0 + numNegate := 0 + numNegateUnion := 0 + tests := []struct { + attempts int + insertCount int + insertRange int + }{ + // 64 inserts + { + insertCount: 64, + insertRange: 64, + }, + { + insertCount: 64, + insertRange: 128, + }, + { + insertCount: 64, + insertRange: 256, + }, + // 4096 inserts + { + insertCount: 4096, + insertRange: 4096, + }, + { + insertCount: 4096, + insertRange: 8192, + }, + { + insertCount: 4096, + insertRange: 16384, + }, + // 65536 inserts + { + insertCount: 65536, + insertRange: 65536, + }, + { + insertCount: 65536, + insertRange: 131072, + }, + { + insertCount: 4096, + insertRange: 262144, + }, + } + + for _, test := range tests { + genOpts := genRandBitmapAndReadOnlyBitmapOptions{ + rng: rng, + insertRange: test.insertRange, + insertCount: test.insertCount, + } + for i := 0; i < each; i++ { + t.Run(fmt.Sprintf("attempt=%d, test=+%v", i, test), func(t *testing.T) { + allReadOnly, err := NewReadOnlyBitmapRange(0, uint64(test.insertRange+1)) + require.NoError(t, err) + + reg, regReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numRegular, genOpts) + union, unionReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numUnion, genOpts) + negate, negateReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numNegate, genOpts) + negateUnion, negateUnionReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, numNegateUnion, genOpts) + + // First create the inner multi-bitmaps. 
+ multiInner := concat(regReadOnly) + + if numUnion > 0 { + innerUnion, err := UnionReadOnly(unionReadOnly) + require.NoError(t, err) + multiInner = append(multiInner, innerUnion) + } + + if numNegate > 0 { + innerNegate, err := IntersectAndNegateReadOnly(lists(allReadOnly), negateReadOnly) + require.NoError(t, err) + multiInner = append(multiInner, innerNegate) + } + + if numNegateUnion > 0 { + innerNegateUnionUnion, err := UnionReadOnly(negateUnionReadOnly) + require.NoError(t, err) + innerNegateUnion, err := IntersectAndNegateReadOnly(lists(allReadOnly), lists(innerNegateUnionUnion)) + require.NoError(t, err) + multiInner = append(multiInner, innerNegateUnion) + } + + // Create top level multi-bitmap. + multi, err := IntersectAndNegateReadOnly(multiInner, nil) + require.NoError(t, err) + + // Perform same operations the old way with postings lists. + bitmap := roaring.NewBitmap() + // Make sure at least some regular postings lists are being + // intersected, otherwise starting with all bitmap won't be + // useful. + require.True(t, len(reg) > 0) + // First set all bits in the range. + bitmap = bitmap.Flip(0, uint64(test.insertRange)) + // Intersect with regular bitmaps now. + for _, pl := range reg { + bitmap = bitmap.Intersect(bitmapFromPostings(t, pl)) + } + // Intersect with union. + if numUnion > 0 { + pl, err := Union(union) + require.NoError(t, err) + bitmap = bitmap.Intersect(bitmapFromPostings(t, pl)) + } + // Intersect with negate. + if numNegate > 0 { + for _, pl := range negate { + bitmap = bitmap.Difference(bitmapFromPostings(t, pl)) + } + } + // Intersect with negate of union. + if numNegateUnion > 0 { + pl, err := Union(negateUnion) + require.NoError(t, err) + bitmap = bitmap.Difference(bitmapFromPostings(t, pl)) + } + transformed := NewPostingsListFromBitmap(bitmap) + + // Check for equality. + equal := postings.Equal(multi, transformed) + if !equal { + msg := fmt.Sprintf("multi-bitmap: %s\nstandard: %s\n", + postingsString(multi), postingsString(transformed)) + if debug := os.Getenv("TEST_DEBUG_DIR"); debug != "" { + e0 := ioutil.WriteFile(path.Join(debug, "actual.json"), []byte(postingsJSON(t, multi)), 0666) + e1 := ioutil.WriteFile(path.Join(debug, "expected.json"), []byte(postingsJSON(t, transformed)), 0666) + require.NoError(t, e0) + require.NoError(t, e1) + msg += fmt.Sprintf("wrote debug: %s\n", debug) + } + require.True(t, equal, msg) + } + + // Check for contains. + // iter := transformed.Iterator() + // for iter.Next() { + // curr := iter.Current() + // require.True(t, multi.Contains(curr)) + // } + // require.NoError(t, iter.Err()) + // require.NoError(t, iter.Close()) + }) + } + } +} + +func bitmapFromPostings(t *testing.T, pl postings.List) *roaring.Bitmap { + b, ok := BitmapFromPostingsList(pl) + require.True(t, ok) + return b +} + +func lists(list ...postings.List) []postings.List { + return list +} + +func concat(lists ...[]postings.List) []postings.List { + var result []postings.List + for _, list := range lists { + result = append(result, list...) 
+ } + return result +} + +func genRandBitmapsAndReadOnlyBitmaps( + t *testing.T, + count int, + opts genRandBitmapAndReadOnlyBitmapOptions, +) ([]postings.List, []postings.List) { + var regular, readOnlys []postings.List + for i := 0; i < count; i++ { + list, readOnly := genRandBitmapAndReadOnlyBitmap(t, opts) + regular = append(regular, list) + readOnlys = append(readOnlys, readOnly) + } + return regular, readOnlys +} + +type genRandBitmapAndReadOnlyBitmapOptions struct { + rng *rand.Rand + insertRange int + insertCount int +} + +func genRandBitmapAndReadOnlyBitmap( + t *testing.T, + opts genRandBitmapAndReadOnlyBitmapOptions, +) (postings.List, *ReadOnlyBitmap) { + // Note: do not reuse bitmap since we return postings list which + // references it. + bitmap := roaring.NewBitmap() + + max := uint64(opts.rng.Int63n(int64(opts.insertRange))) + for j := 0; j < opts.insertCount; j++ { + value := opts.rng.Uint64() % max + bitmap.DirectAdd(value) + } + + list := NewPostingsListFromBitmap(bitmap) + + // Note: do not reuse buffer since read only bitmap + // references them. + buff := bytes.NewBuffer(nil) + _, err := bitmap.WriteTo(buff) + require.NoError(t, err) + + readOnly, err := NewReadOnlyBitmap(buff.Bytes()) + require.NoError(t, err) + + return list, readOnly +} + +func postingsString(pl postings.List) string { + var buf bytes.Buffer + iter := pl.Iterator() + for i := 0; iter.Next(); i++ { + if i != 0 { + buf.WriteString(", ") + } + buf.WriteString(fmt.Sprintf("%d", iter.Current())) + } + return "[" + buf.String() + "]" +} + +func postingsJSON(t *testing.T, pl postings.List) string { + var out []uint64 + iter := pl.Iterator() + for i := 0; iter.Next(); i++ { + out = append(out, uint64(iter.Current())) + } + require.NoError(t, iter.Err()) + require.NoError(t, iter.Close()) + data, err := json.MarshalIndent(out, "", " ") + require.NoError(t, err) + return string(data) +} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index f3e1989d1a..c648c96a18 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -39,6 +39,7 @@ const ( bitmapN = 1024 runCountHeaderSize = uint32(2) containerValues = 2 << 15 // 2^16 or 65k + maxBitmap = 0xFFFFFFFFFFFFFFFF ) var ( @@ -54,6 +55,19 @@ const ( containerRun ) +func (t containerType) String() string { + switch t { + case containerArray: + return "array" + case containerBitmap: + return "bitmap" + case containerRun: + return "run" + default: + return "unknown" + } +} + var _ postings.List = (*ReadOnlyBitmap)(nil) // ReadOnlyBitmap is a read only roaring Bitmap of @@ -101,8 +115,8 @@ func (b *ReadOnlyBitmap) Reset(data []byte) error { b.rangeStartInclusive = 0 b.rangeEndExclusive = 0 + // Reset to nil. if len(data) == 0 { - // Reset to nil b.data = nil b.keyN = 0 return nil @@ -126,16 +140,15 @@ func (b *ReadOnlyBitmap) Reset(data []byte) error { } // Read key count in bytes sizeof(cookie):(sizeof(cookie)+sizeof(uint32)). - keyN := uint64(binary.LittleEndian.Uint32(data[4:8])) + b.keyN = uint64(binary.LittleEndian.Uint32(data[4:8])) + b.data = data - minBytesN := headerBaseSize + keyN*12 + keyN*4 - if uint64(len(data)) < minBytesN { - return fmt.Errorf("bitmap too small: need=%d, actual=%d", - minBytesN, len(data)) + // Validate all the containers. 
+ for i := uint64(0); i < b.keyN; i++ { + if _, err := b.containerAtIndex(i); err != nil { + return err + } } - - b.data = data - b.keyN = keyN return nil } @@ -179,12 +192,45 @@ func (r runReadOnlyContainer) contains(v uint16) bool { return idx < n && v >= r.values[idx].start && v <= r.values[idx].last } +func (c readOnlyContainer) validate() error { + switch c.containerType { + case containerBitmap: + need := int(c.offset) + 8*bitmapN // entry uint64 bitmap 8 bytes + if len(c.data) < need { + return fmt.Errorf("data too small for bitmap: needs=%d, actual=%d", + need, len(c.data)) + } + return nil + case containerArray: + need := int(c.offset) + 2*int(c.cardinality) // entry is uint16 2 bytes + if len(c.data) < need { + return fmt.Errorf("data too small for array: needs=%d, actual=%d", + need, len(c.data)) + } + return nil + case containerRun: + need := int(c.offset) + int(runCountHeaderSize) + if len(c.data) < need { + return fmt.Errorf("data too small for runs header: needs=%d, actual=%d", + need, len(c.data)) + } + runCount := binary.LittleEndian.Uint16(c.data[c.offset : c.offset+runCountHeaderSize]) + need = int(c.offset) + int(runCountHeaderSize) + 4*int(runCount) // entry is two uint16s 4 bytes + if len(c.data) < need { + return fmt.Errorf("data too small for runs values: needs=%d, actual=%d", + need, len(c.data)) + } + return nil + } + return fmt.Errorf("unknown container: %d", c.containerType) +} + func (c readOnlyContainer) bitmap() (bitmapReadOnlyContainer, bool) { if c.containerType != containerBitmap { return bitmapReadOnlyContainer{}, false } return bitmapReadOnlyContainer{ - values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN:bitmapN], + values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN], }, true } @@ -193,7 +239,7 @@ func (c readOnlyContainer) array() (arrayReadOnlyContainer, bool) { return arrayReadOnlyContainer{}, false } return arrayReadOnlyContainer{ - values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality:c.cardinality], + values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality], }, true } @@ -203,7 +249,7 @@ func (c readOnlyContainer) runs() (runReadOnlyContainer, bool) { } runCount := binary.LittleEndian.Uint16(c.data[c.offset : c.offset+runCountHeaderSize]) return runReadOnlyContainer{ - values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount:runCount], + values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount], }, true } @@ -221,19 +267,49 @@ func (b *ReadOnlyBitmap) container(key uint64) (readOnlyContainer, bool) { if !ok { return readOnlyContainer{}, false } - return b.containerAtIndex(index), true -} - -func (b *ReadOnlyBitmap) containerAtIndex(index uint64) readOnlyContainer { - meta := b.data[headerBaseSize+index*12:] - offsets := b.data[headerBaseSize+b.keyN*12+index*4:] - return readOnlyContainer{ + // All offsets validated at construction time, safe to ignore the + // error here. + // If we had to return an error to Contains(...) and Iterator() then + // we wouldn't be able to implement the API contract. + // Today we also have this same issue with existing mmap backed roaring + // bitmaps from pilosa, so it doesn't reduce or expand our risk exposure. 
+ container, _ := b.containerAtIndex(index) + return container, true +} + +func (b *ReadOnlyBitmap) containerAtIndex(index uint64) (readOnlyContainer, error) { + const ( + metaTypeStart = 8 + metaTypeEnd = 10 + metaCardStart = 10 + metaCardEnd = 12 + offsetStart = 0 + offsetEnd = 4 + ) + metaIdx := headerBaseSize + index*12 + offsetIdx := headerBaseSize + b.keyN*12 + index*4 + size := uint64(len(b.data)) + if size < metaIdx+metaCardEnd { + return readOnlyContainer{}, fmt.Errorf( + "data too small: need=%d, actual=%d", metaIdx+metaCardEnd, size) + } + if size < offsetIdx+offsetEnd { + return readOnlyContainer{}, fmt.Errorf( + "data too small: need=%d, actual=%d", offsetIdx+offsetEnd, size) + } + meta := b.data[metaIdx:] + offsets := b.data[offsetIdx:] + container := readOnlyContainer{ data: b.data, key: b.keyAtIndex(int(index)), - containerType: containerType(binary.LittleEndian.Uint16(meta[8:10])), - cardinality: uint16(binary.LittleEndian.Uint16(meta[10:12])) + 1, - offset: binary.LittleEndian.Uint32(offsets[0:4]), + containerType: containerType(binary.LittleEndian.Uint16(meta[metaTypeStart:metaTypeEnd])), + cardinality: uint16(binary.LittleEndian.Uint16(meta[metaCardStart:metaCardEnd])) + 1, + offset: binary.LittleEndian.Uint32(offsets[offsetStart:offsetEnd]), } + if err := container.validate(); err != nil { + return readOnlyContainer{}, err + } + return container, nil } func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { @@ -270,7 +346,14 @@ func (b *ReadOnlyBitmap) IsEmpty() bool { func (b *ReadOnlyBitmap) count() int { l := 0 for i := uint64(0); i < b.keyN; i++ { - l += int(b.containerAtIndex(i).cardinality) + // All offsets validated at construction time, safe to ignore the + // error here. + // If we had to return an error to Contains(...) and Iterator() then + // we wouldn't be able to implement the API contract. + // Today we also have this same issue with existing mmap backed roaring + // bitmaps from pilosa, so it doesn't reduce or expand our risk exposure. + container, _ := b.containerAtIndex(i) + l += int(container.cardinality) } return l } @@ -336,6 +419,7 @@ var _ postings.Iterator = (*readOnlyBitmapIterator)(nil) type readOnlyBitmapIterator struct { b *ReadOnlyBitmap + err error containerIndex int containerExhausted bool container readOnlyContainer @@ -386,7 +470,7 @@ func (i *readOnlyBitmapIterator) setContainer(c readOnlyContainer) { } func (i *readOnlyBitmapIterator) Next() bool { - if i.containerIndex >= int(i.b.keyN) { + if i.err != nil || i.containerIndex >= int(i.b.keyN) { // Already exhausted. 
 		return false
 	}
 
@@ -397,8 +481,15 @@ func (i *readOnlyBitmapIterator) Next() bool {
 		if i.containerIndex >= int(i.b.keyN) {
 			return false
 		}
+
+		container, err := i.b.containerAtIndex(uint64(i.containerIndex))
+		if err != nil {
+			i.err = err
+			return false
+		}
+
 		i.containerExhausted = false
-		i.setContainer(i.b.containerAtIndex(uint64(i.containerIndex)))
+		i.setContainer(container)
 	}
 
 	if i.container.containerType == containerBitmap {
@@ -489,6 +580,7 @@ var _ containerIterator = (*readOnlyBitmapContainerIterator)(nil)
 
 type readOnlyBitmapContainerIterator struct {
 	b              *ReadOnlyBitmap
+	err            error
 	containerIndex int
 	container      readOnlyContainer
 }
@@ -503,11 +595,22 @@ func newReadOnlyBitmapContainerIterator(
 }
 
 func (i *readOnlyBitmapContainerIterator) NextContainer() bool {
+	if i.err != nil || i.containerIndex >= int(i.b.keyN) {
+		return false
+	}
+
 	i.containerIndex++
 	if i.containerIndex >= int(i.b.keyN) {
 		return false
 	}
-	i.container = i.b.containerAtIndex(uint64(i.containerIndex))
+
+	container, err := i.b.containerAtIndex(uint64(i.containerIndex))
+	if err != nil {
+		i.err = err
+		return false
+	}
+
+	i.container = container
 	return true
 }
 
@@ -624,7 +727,12 @@ func (i *readOnlyBitmapContainerIterator) ContainerNegate(
 	}
 }
 
-const maxBitmap = 0xFFFFFFFFFFFFFFFF
+func (i *readOnlyBitmapContainerIterator) Err() error {
+	return i.err
+}
+
+func (i *readOnlyBitmapContainerIterator) Close() {
+}
 
 // bitmapSetRange sets all bits in [i, j) the same as pilosa's
 // bitmapSetRangeIgnoreN.
@@ -824,3 +932,10 @@ func (i *readOnlyBitmapRangeContainerIterator) ContainerNegate(
 	bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1)
 	differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap)
 }
+
+func (i *readOnlyBitmapRangeContainerIterator) Err() error {
+	return nil
+}
+
+func (i *readOnlyBitmapRangeContainerIterator) Close() {
+}
diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_readonly_test.go
index 9db96e1307..4b86870e6b 100644
--- a/src/m3ninx/postings/roaring/bitmap_readonly_test.go
+++ b/src/m3ninx/postings/roaring/bitmap_readonly_test.go
@@ -99,6 +99,9 @@ func TestReadOnlyBitmap(t *testing.T) {
 
 	list := NewPostingsListFromBitmap(b)
 
+	// Note: Do not reuse buffer before done with the
+	// read only bitmap that is backed by the bytes from the
+	// buffer.
buff.Reset() _, err := b.WriteTo(buff) require.NoError(t, err) From a0ce92fa4a54bf0091b51f3d5e4aaed054dc417a Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 28 Oct 2020 04:51:12 -0400 Subject: [PATCH 008/106] Fix multi segments builder --- ...i_segments_multi_key_postings_list_iter.go | 23 ++++++++-------- .../builder/multi_segments_terms_iter.go | 27 ++++++++----------- .../index/segment/builder/terms_test.go | 4 +-- 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go index 206be79fd3..a7b002f388 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go @@ -41,14 +41,12 @@ type multiKeyPostingsListIterator struct { currIters []keyIterator currReaders []index.Reader currFieldPostingsList postings.MutableList - bitmapIter *bitmap.Iterator } func newMultiKeyPostingsListIterator() *multiKeyPostingsListIterator { b := bitmap.NewBitmapWithDefaultPooling(defaultBitmapContainerPooling) i := &multiKeyPostingsListIterator{ currFieldPostingsList: roaring.NewPostingsListFromBitmap(b), - bitmapIter: &bitmap.Iterator{}, } i.reset() return i @@ -158,19 +156,12 @@ func (i *multiKeyPostingsListIterator) Next() bool { // We have to taken into account the offset and duplicates var ( - iter = i.bitmapIter + iter = pl.Iterator() duplicates = fieldsKeyIter.segment.duplicatesAsc negativeOffset postings.ID ) - bitmap, ok := roaring.BitmapFromPostingsList(pl) - if !ok { - i.err = errPostingsListNotRoaring - return false - } - - iter.Reset(bitmap) - for v, eof := iter.Next(); !eof; v, eof = iter.Next() { - curr := postings.ID(v) + for iter.Next() { + curr := iter.Current() for len(duplicates) > 0 && curr > duplicates[0] { duplicates = duplicates[1:] negativeOffset++ @@ -183,10 +174,18 @@ func (i *multiKeyPostingsListIterator) Next() bool { } value := curr + fieldsKeyIter.segment.offset - negativeOffset if err := i.currFieldPostingsList.Insert(value); err != nil { + iter.Close() i.err = err return false } } + + err = iter.Err() + iter.Close() + if err != nil { + i.err = err + return false + } } return true } diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go index d27eb3ec8d..c4edeeedb2 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go @@ -21,8 +21,6 @@ package builder import ( - "errors" - "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -34,10 +32,6 @@ const ( defaultBitmapContainerPooling = 128 ) -var ( - errPostingsListNotRoaring = errors.New("postings list not a roaring postings list") -) - // Ensure for our use case that the terms iter from segments we return // matches the signature for the terms iterator. 
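The remapping loop in the hunk above (and its twin in the terms iterator
below) shifts every surviving postings ID by the merged segment's base
offset and subtracts one for each earlier duplicate that was dropped. A
worked sketch of that arithmetic with hypothetical values, for a segment
merged at offset 100 with duplicatesAsc = [3, 7]:

	// curr=2: no duplicates passed yet -> value = 2 + 100 - 0 = 102
	// curr=3: is itself a duplicate    -> skipped, negativeOffset becomes 1
	// curr=5: one duplicate passed     -> value = 5 + 100 - 1 = 104
	// curr=9: both duplicates passed   -> value = 9 + 100 - 2 = 107
	value := curr + segmentOffset - negativeOffset // segmentOffset is a stand-in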
var _ segment.TermsIterator = &termsIterFromSegments{} @@ -150,19 +144,12 @@ func (i *termsIterFromSegments) Next() bool { // We have to taken into account the offset and duplicates var ( - iter = i.bitmapIter + iter = list.Iterator() duplicates = termsKeyIter.segment.duplicatesAsc negativeOffset postings.ID ) - bitmap, ok := roaring.BitmapFromPostingsList(list) - if !ok { - i.err = errPostingsListNotRoaring - return false - } - - iter.Reset(bitmap) - for v, eof := iter.Next(); !eof; v, eof = iter.Next() { - curr := postings.ID(v) + for iter.Next() { + curr := iter.Current() for len(duplicates) > 0 && curr > duplicates[0] { duplicates = duplicates[1:] negativeOffset++ @@ -175,10 +162,18 @@ func (i *termsIterFromSegments) Next() bool { } value := curr + termsKeyIter.segment.offset - negativeOffset if err := i.currPostingsList.Insert(value); err != nil { + iter.Close() i.err = err return false } } + + err := iter.Err() + iter.Close() + if err != nil { + i.err = err + return false + } } return true diff --git a/src/m3ninx/index/segment/builder/terms_test.go b/src/m3ninx/index/segment/builder/terms_test.go index 0a17d7a062..20397ab870 100644 --- a/src/m3ninx/index/segment/builder/terms_test.go +++ b/src/m3ninx/index/segment/builder/terms_test.go @@ -34,10 +34,10 @@ func TestTermsReuse(t *testing.T) { require.NoError(t, terms.post([]byte("term"), postings.ID(1))) require.Equal(t, terms.size(), 1) require.Equal(t, terms.postings.Len(), 1) - require.Equal(t, terms.postingsListUnion.Len(), 1) + require.Equal(t, terms.postingsListUnion.CountSlow(), 1) terms.reset() require.Equal(t, terms.size(), 0) require.Equal(t, terms.postings.Len(), 0) - require.Equal(t, terms.postingsListUnion.Len(), 0) + require.Equal(t, terms.postingsListUnion.CountSlow(), 0) } From 6786415f4a5f0e3836930f3b5f21c47213afb801 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 28 Oct 2020 05:23:59 -0400 Subject: [PATCH 009/106] Fix fieldsAndTermsIter --- .../storage/index/fields_terms_iterator.go | 20 ++++++----- .../postings/roaring/bitmap_multi_readonly.go | 33 +++++++++++------- .../postings/roaring/bitmap_readonly.go | 34 +++++++++++++++++++ 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index e13d8187ff..ae1710beec 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -27,7 +27,6 @@ import ( "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" xerrors "github.com/m3db/m3/src/x/errors" - pilosaroaring "github.com/m3dbx/pilosa/roaring" ) var ( @@ -74,7 +73,8 @@ type fieldsAndTermsIter struct { postings postings.List } - restrictByPostings *pilosaroaring.Bitmap + restrictByPostings *roaring.ReadOnlyBitmap + restrictByPostingsIntersect *roaring.ReadOnlyBitmapIntersectCheck } var ( @@ -89,7 +89,9 @@ type newFieldsAndTermsIteratorFn func( ) (fieldsAndTermsIterator, error) func newFieldsAndTermsIterator(reader segment.Reader, opts fieldsAndTermsIteratorOpts) (fieldsAndTermsIterator, error) { - iter := &fieldsAndTermsIter{} + iter := &fieldsAndTermsIter{ + restrictByPostingsIntersect: roaring.NewReadOnlyBitmapIntersectCheck(), + } err := iter.Reset(reader, opts) if err != nil { return nil, err @@ -129,7 +131,7 @@ func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsI // Hold onto the postings bitmap to intersect against on a per term basis. 
// TODO: This will be a read only bitmap, need to update. - bitmap, ok := roaring.BitmapFromPostingsList(pl) + bitmap, ok := roaring.ReadOnlyBitmapFromPostingsList(pl) if !ok { return errUnpackBitmapFromPostingsList } @@ -210,17 +212,17 @@ func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) { } // TODO: This will be a read only bitmap, need to update. - bitmap, ok := roaring.BitmapFromPostingsList(fti.current.postings) + bitmap, ok := roaring.ReadOnlyBitmapFromPostingsList(fti.current.postings) if !ok { return false, errUnpackBitmapFromPostingsList } - // Check term isn part of at least some of the documents we're + // Check term isn't part of at least some of the documents we're // restricted to providing results for based on intersection // count. - // Note: IntersectionCount is significantly faster than intersecting and - // counting results and also does not allocate. - if n := fti.restrictByPostings.IntersectionCount(bitmap); n > 0 { + restrictBy := fti.restrictByPostings + match := fti.restrictByPostingsIntersect.Intersects(restrictBy, bitmap) + if match { // Matches, this is next result. return true, nil } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 82ad54195e..01f9514e07 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -278,24 +278,31 @@ type containerOpContext struct { func newMultiBitmapIterator( opts multiBitmapOptions, ) *multiBitmapIterator { - var ( - n = len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) - iters = make([]containerIteratorAndOp, 0, n) - ) - iters = appendContainerItersWithOp(iters, opts.union, multiContainerOpUnion) - iters = appendContainerItersWithOp(iters, opts.intersect, multiContainerOpIntersect) - iters = appendContainerItersWithOp(iters, opts.intersectNegate, multiContainerOpNegate) i := &multiBitmapIterator{ - multiBitmapOptions: opts, - initial: iters, - iters: iters, - bitmap: getBitmapContainer(), - tempBitmap: getBitmapContainer(), + bitmap: getBitmapContainer(), + tempBitmap: getBitmapContainer(), } - i.bitmapIter.Reset(0, i.bitmap) + i.Reset(opts) return i } +func (i *multiBitmapIterator) Reset(opts multiBitmapOptions) { + i.multiBitmapOptions = opts + n := len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) + if i.iters == nil { + i.iters = make([]containerIteratorAndOp, 0, n) + } + i.iters = i.iters[:0] + i.iters = appendContainerItersWithOp(i.iters, opts.union, multiContainerOpUnion) + i.iters = appendContainerItersWithOp(i.iters, opts.intersect, multiContainerOpIntersect) + i.iters = appendContainerItersWithOp(i.iters, opts.intersectNegate, multiContainerOpNegate) + i.initial = i.iters[:] + i.err = nil + i.multiContainerIter = multiBitmapContainerIterator{} + i.bitmap.Reset(false) + i.bitmapIter.Reset(0, i.bitmap) +} + func appendContainerItersWithOp( slice []containerIteratorAndOp, iterables []multiBitmapIterable, diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index d08406427e..fbf596a477 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -76,6 +76,40 @@ func lowbits(v uint64) uint16 { return uint16(v & 0xFFFF) } +// ReadOnlyBitmapFromPostingsList returns a bitmap from a postings list if it +// is a read only roaring bitmap postings list. 
+func ReadOnlyBitmapFromPostingsList(pl postings.List) (*ReadOnlyBitmap, bool) { + result, ok := pl.(*ReadOnlyBitmap) + if !ok { + return nil, false + } + return result, true +} + +// ReadOnlyBitmapIntersectCheck is a check that can be repeated +// against read only bitmaps without allocations. +type ReadOnlyBitmapIntersectCheck struct { + multiBitmapIterator *multiBitmapIterator + intersect []multiBitmapIterable +} + +func NewReadOnlyBitmapIntersectCheck() *ReadOnlyBitmapIntersectCheck { + return &ReadOnlyBitmapIntersectCheck{ + multiBitmapIterator: newMultiBitmapIterator(multiBitmapOptions{}), + intersect: make([]multiBitmapIterable, 2), + } +} + +func (c *ReadOnlyBitmapIntersectCheck) Intersects(a, b *ReadOnlyBitmap) bool { + c.intersect[0] = multiBitmapIterable{bitmap: a} + c.intersect[1] = multiBitmapIterable{bitmap: b} + c.multiBitmapIterator.Reset(multiBitmapOptions{ + op: multiBitmapOpIntersect, + intersect: c.intersect, + }) + return c.multiBitmapIterator.Next() +} + var _ postings.List = (*ReadOnlyBitmap)(nil) // ReadOnlyBitmap is a read only roaring Bitmap of From dd583156fdcf302ca71493fc8778d69d5796c1ae Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 28 Oct 2020 05:44:08 -0400 Subject: [PATCH 010/106] Fix checker --- .../storage/index/fields_terms_iterator.go | 24 ++--- .../index/fields_terms_iterator_test.go | 4 +- src/m3ninx/index/index_mock.go | 6 +- src/m3ninx/index/segment/segment_mock.go | 4 +- src/m3ninx/postings/postings_mock.go | 98 ++++++++----------- .../postings/roaring/bitmap_readonly.go | 24 ++++- 6 files changed, 78 insertions(+), 82 deletions(-) diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index ae1710beec..9941a5bd13 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -73,7 +73,7 @@ type fieldsAndTermsIter struct { postings postings.List } - restrictByPostings *roaring.ReadOnlyBitmap + restrictByPostings postings.List restrictByPostingsIntersect *roaring.ReadOnlyBitmapIntersectCheck } @@ -100,7 +100,9 @@ func newFieldsAndTermsIterator(reader segment.Reader, opts fieldsAndTermsIterato } func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsIteratorOpts) error { + restrictByPostingsIntersect := fti.restrictByPostingsIntersect *fti = fieldsAndTermsIterZeroed + fti.restrictByPostingsIntersect = restrictByPostingsIntersect fti.reader = reader fti.opts = opts if reader == nil { @@ -130,13 +132,7 @@ func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsI } // Hold onto the postings bitmap to intersect against on a per term basis. - // TODO: This will be a read only bitmap, need to update. - bitmap, ok := roaring.ReadOnlyBitmapFromPostingsList(pl) - if !ok { - return errUnpackBitmapFromPostingsList - } - - fti.restrictByPostings = bitmap + fti.restrictByPostings = pl return nil } @@ -211,17 +207,15 @@ func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) { return true, nil } - // TODO: This will be a read only bitmap, need to update. - bitmap, ok := roaring.ReadOnlyBitmapFromPostingsList(fti.current.postings) - if !ok { - return false, errUnpackBitmapFromPostingsList - } - // Check term isn't part of at least some of the documents we're // restricted to providing results for based on intersection // count. 
restrictBy := fti.restrictByPostings - match := fti.restrictByPostingsIntersect.Intersects(restrictBy, bitmap) + curr := fti.current.postings + match, err := fti.restrictByPostingsIntersect.Intersects(restrictBy, curr) + if err != nil { + return false, err + } if match { // Matches, this is next result. return true, nil diff --git a/src/dbnode/storage/index/fields_terms_iterator_test.go b/src/dbnode/storage/index/fields_terms_iterator_test.go index b1ae28c8dd..f2cdf7b629 100644 --- a/src/dbnode/storage/index/fields_terms_iterator_test.go +++ b/src/dbnode/storage/index/fields_terms_iterator_test.go @@ -238,7 +238,9 @@ func TestFieldsTermsIteratorIterateTermsAndRestrictByQuery(t *testing.T) { colorRegexp, err := idx.NewRegexpQuery([]byte("color"), []byte("^(red|yellow)$")) require.NoError(t, err) - reader, err := seg.Reader() + // Make sure to use fst segment so that read only bitmaps returned. + fstSeg := fst.ToTestSegment(t, seg, testFstOptions) + reader, err := fstSeg.Reader() require.NoError(t, err) iter, err := newFieldsAndTermsIterator(reader, fieldsAndTermsIteratorOpts{ diff --git a/src/m3ninx/index/index_mock.go b/src/m3ninx/index/index_mock.go index 26320c2311..b3d41754c8 100644 --- a/src/m3ninx/index/index_mock.go +++ b/src/m3ninx/index/index_mock.go @@ -1,7 +1,7 @@ // Code generated by MockGen. DO NOT EDIT. // Source: github.com/m3db/m3/src/m3ninx/index (interfaces: Reader,DocRetriever) -// Copyright (c) 2019 Uber Technologies, Inc. +// Copyright (c) 2020 Uber Technologies, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -116,10 +116,10 @@ func (mr *MockReaderMockRecorder) Docs(arg0 interface{}) *gomock.Call { } // MatchAll mocks base method -func (m *MockReader) MatchAll() (postings.MutableList, error) { +func (m *MockReader) MatchAll() (postings.List, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "MatchAll") - ret0, _ := ret[0].(postings.MutableList) + ret0, _ := ret[0].(postings.List) ret1, _ := ret[1].(error) return ret0, ret1 } diff --git a/src/m3ninx/index/segment/segment_mock.go b/src/m3ninx/index/segment/segment_mock.go index 4721f195c2..1ba82dbe59 100644 --- a/src/m3ninx/index/segment/segment_mock.go +++ b/src/m3ninx/index/segment/segment_mock.go @@ -242,10 +242,10 @@ func (mr *MockReaderMockRecorder) MatchRegexp(field, c interface{}) *gomock.Call } // MatchAll mocks base method -func (m *MockReader) MatchAll() (postings.MutableList, error) { +func (m *MockReader) MatchAll() (postings.List, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "MatchAll") - ret0, _ := ret[0].(postings.MutableList) + ret0, _ := ret[0].(postings.List) ret1, _ := ret[1].(error) return ret0, ret1 } diff --git a/src/m3ninx/postings/postings_mock.go b/src/m3ninx/postings/postings_mock.go index 037698588c..298b0d30d0 100644 --- a/src/m3ninx/postings/postings_mock.go +++ b/src/m3ninx/postings/postings_mock.go @@ -81,33 +81,33 @@ func (mr *MockListMockRecorder) IsEmpty() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsEmpty", reflect.TypeOf((*MockList)(nil).IsEmpty)) } -// Max mocks base method -func (m *MockList) Max() (ID, error) { +// CountFast mocks base method +func (m *MockList) CountFast() (int, bool) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Max") - ret0, _ := ret[0].(ID) - ret1, _ := ret[1].(error) + ret := m.ctrl.Call(m, "CountFast") + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(bool) return ret0, ret1 } -// Max indicates an expected call 
of Max -func (mr *MockListMockRecorder) Max() *gomock.Call { +// CountFast indicates an expected call of CountFast +func (mr *MockListMockRecorder) CountFast() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Max", reflect.TypeOf((*MockList)(nil).Max)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CountFast", reflect.TypeOf((*MockList)(nil).CountFast)) } -// Len mocks base method -func (m *MockList) Len() int { +// CountSlow mocks base method +func (m *MockList) CountSlow() int { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Len") + ret := m.ctrl.Call(m, "CountSlow") ret0, _ := ret[0].(int) return ret0 } -// Len indicates an expected call of Len -func (mr *MockListMockRecorder) Len() *gomock.Call { +// CountSlow indicates an expected call of CountSlow +func (mr *MockListMockRecorder) CountSlow() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Len", reflect.TypeOf((*MockList)(nil).Len)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CountSlow", reflect.TypeOf((*MockList)(nil).CountSlow)) } // Iterator mocks base method @@ -124,20 +124,6 @@ func (mr *MockListMockRecorder) Iterator() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Iterator", reflect.TypeOf((*MockList)(nil).Iterator)) } -// Clone mocks base method -func (m *MockList) Clone() MutableList { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Clone") - ret0, _ := ret[0].(MutableList) - return ret0 -} - -// Clone indicates an expected call of Clone -func (mr *MockListMockRecorder) Clone() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Clone", reflect.TypeOf((*MockList)(nil).Clone)) -} - // Equal mocks base method func (m *MockList) Equal(other List) bool { m.ctrl.T.Helper() @@ -203,33 +189,33 @@ func (mr *MockMutableListMockRecorder) IsEmpty() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsEmpty", reflect.TypeOf((*MockMutableList)(nil).IsEmpty)) } -// Max mocks base method -func (m *MockMutableList) Max() (ID, error) { +// CountFast mocks base method +func (m *MockMutableList) CountFast() (int, bool) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Max") - ret0, _ := ret[0].(ID) - ret1, _ := ret[1].(error) + ret := m.ctrl.Call(m, "CountFast") + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(bool) return ret0, ret1 } -// Max indicates an expected call of Max -func (mr *MockMutableListMockRecorder) Max() *gomock.Call { +// CountFast indicates an expected call of CountFast +func (mr *MockMutableListMockRecorder) CountFast() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Max", reflect.TypeOf((*MockMutableList)(nil).Max)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CountFast", reflect.TypeOf((*MockMutableList)(nil).CountFast)) } -// Len mocks base method -func (m *MockMutableList) Len() int { +// CountSlow mocks base method +func (m *MockMutableList) CountSlow() int { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Len") + ret := m.ctrl.Call(m, "CountSlow") ret0, _ := ret[0].(int) return ret0 } -// Len indicates an expected call of Len -func (mr *MockMutableListMockRecorder) Len() *gomock.Call { +// CountSlow indicates an expected call of CountSlow +func (mr *MockMutableListMockRecorder) CountSlow() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Len", reflect.TypeOf((*MockMutableList)(nil).Len)) + return 
mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CountSlow", reflect.TypeOf((*MockMutableList)(nil).CountSlow)) } // Iterator mocks base method @@ -246,20 +232,6 @@ func (mr *MockMutableListMockRecorder) Iterator() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Iterator", reflect.TypeOf((*MockMutableList)(nil).Iterator)) } -// Clone mocks base method -func (m *MockMutableList) Clone() MutableList { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Clone") - ret0, _ := ret[0].(MutableList) - return ret0 -} - -// Clone indicates an expected call of Clone -func (mr *MockMutableListMockRecorder) Clone() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Clone", reflect.TypeOf((*MockMutableList)(nil).Clone)) -} - // Equal mocks base method func (m *MockMutableList) Equal(other List) bool { m.ctrl.T.Helper() @@ -386,6 +358,20 @@ func (mr *MockMutableListMockRecorder) RemoveRange(min, max interface{}) *gomock return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveRange", reflect.TypeOf((*MockMutableList)(nil).RemoveRange), min, max) } +// Clone mocks base method +func (m *MockMutableList) Clone() MutableList { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Clone") + ret0, _ := ret[0].(MutableList) + return ret0 +} + +// Clone indicates an expected call of Clone +func (mr *MockMutableListMockRecorder) Clone() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Clone", reflect.TypeOf((*MockMutableList)(nil).Clone)) +} + // Reset mocks base method func (m *MockMutableList) Reset() { m.ctrl.T.Helper() diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index fbf596a477..e0547ee781 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -43,7 +43,8 @@ const ( ) var ( - errNotPilosaRoaring = errors.New("not pilosa roaring format") + errNotPilosaRoaring = errors.New("not pilosa roaring format") + errNotReadOnlyBitmap = errors.New("not read only bitmap") ) type containerType byte @@ -100,14 +101,27 @@ func NewReadOnlyBitmapIntersectCheck() *ReadOnlyBitmapIntersectCheck { } } -func (c *ReadOnlyBitmapIntersectCheck) Intersects(a, b *ReadOnlyBitmap) bool { - c.intersect[0] = multiBitmapIterable{bitmap: a} - c.intersect[1] = multiBitmapIterable{bitmap: b} +func (c *ReadOnlyBitmapIntersectCheck) Intersects(a, b postings.List) (bool, error) { + if pl, ok := a.(*ReadOnlyBitmap); ok { + c.intersect[0] = multiBitmapIterable{bitmap: pl} + } else if pl, ok := a.(*multiBitmap); ok { + c.intersect[0] = multiBitmapIterable{multiBitmap: pl} + } else { + return false, errNotReadOnlyBitmap + } + if pl, ok := b.(*ReadOnlyBitmap); ok { + c.intersect[1] = multiBitmapIterable{bitmap: pl} + } else if pl, ok := b.(*multiBitmap); ok { + c.intersect[1] = multiBitmapIterable{multiBitmap: pl} + } else { + return false, errNotReadOnlyBitmap + } + c.multiBitmapIterator.Reset(multiBitmapOptions{ op: multiBitmapOpIntersect, intersect: c.intersect, }) - return c.multiBitmapIterator.Next() + return c.multiBitmapIterator.Next(), nil } var _ postings.List = (*ReadOnlyBitmap)(nil) From fd8e4e578599a64731cf5ea042a80d593a7b236b Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 1 Nov 2020 10:03:17 -0500 Subject: [PATCH 011/106] Fix multi-intersect searches that start with empty posting lists --- .../postings/roaring/bitmap_multi_readonly.go | 49 +++++++++++-------- src/m3ninx/search/searcher/all_test.go | 2 
+- .../search/searcher/conjunction_test.go | 45 ++++++++++++++--- 3 files changed, 69 insertions(+), 27 deletions(-) diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 01f9514e07..53aceda074 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -292,11 +292,17 @@ func (i *multiBitmapIterator) Reset(opts multiBitmapOptions) { if i.iters == nil { i.iters = make([]containerIteratorAndOp, 0, n) } + if i.initial == nil { + i.initial = make([]containerIteratorAndOp, 0, n) + } i.iters = i.iters[:0] - i.iters = appendContainerItersWithOp(i.iters, opts.union, multiContainerOpUnion) - i.iters = appendContainerItersWithOp(i.iters, opts.intersect, multiContainerOpIntersect) - i.iters = appendContainerItersWithOp(i.iters, opts.intersectNegate, multiContainerOpNegate) - i.initial = i.iters[:] + i.initial = i.initial[:0] + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, + opts.union, multiContainerOpUnion) + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, + opts.intersect, multiContainerOpIntersect) + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, + opts.intersectNegate, multiContainerOpNegate) i.err = nil i.multiContainerIter = multiBitmapContainerIterator{} i.bitmap.Reset(false) @@ -304,10 +310,11 @@ func (i *multiBitmapIterator) Reset(opts multiBitmapOptions) { } func appendContainerItersWithOp( - slice []containerIteratorAndOp, + initial []containerIteratorAndOp, + iters []containerIteratorAndOp, iterables []multiBitmapIterable, op multiContainerOp, -) []containerIteratorAndOp { +) ([]containerIteratorAndOp, []containerIteratorAndOp) { for _, elem := range iterables { var it containerIterator switch { @@ -317,16 +324,21 @@ func appendContainerItersWithOp( it = elem.bitmap.containerIterator() } + initial = append(initial, containerIteratorAndOp{ + it: it, + op: op, + }) + if !it.NextContainer() { continue } - slice = append(slice, containerIteratorAndOp{ + iters = append(iters, containerIteratorAndOp{ it: it, op: op, }) } - return slice + return initial, iters } func (i *multiBitmapIterator) Next() bool { @@ -521,12 +533,16 @@ func newMultiBitmapContainersIterator( opts multiBitmapOptions, ) *multiBitmapContainersIterator { var ( - n = len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) - iters = make([]containerIteratorAndOp, 0, n) + n = len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) + iters = make([]containerIteratorAndOp, 0, n) + initial = make([]containerIteratorAndOp, 0, n) ) - iters = appendContainerItersWithOp(iters, opts.union, multiContainerOpUnion) - iters = appendContainerItersWithOp(iters, opts.intersect, multiContainerOpIntersect) - iters = appendContainerItersWithOp(iters, opts.intersectNegate, multiContainerOpNegate) + initial, iters = appendContainerItersWithOp(initial, iters, + opts.union, multiContainerOpUnion) + initial, iters = appendContainerItersWithOp(initial, iters, + opts.intersect, multiContainerOpIntersect) + initial, iters = appendContainerItersWithOp(initial, iters, + opts.intersectNegate, multiContainerOpNegate) return &multiBitmapContainersIterator{ multiBitmapOptions: opts, initial: iters, @@ -540,13 +556,6 @@ func (i *multiBitmapContainersIterator) NextContainer() bool { return false } - if i.first { - // Always have some valid iterators since we wouldn't - // have enqueued if not. 
- i.first = false - return true - } - var ( ok bool err error diff --git a/src/m3ninx/search/searcher/all_test.go b/src/m3ninx/search/searcher/all_test.go index 18e6f48cf6..c4cedbaaac 100644 --- a/src/m3ninx/search/searcher/all_test.go +++ b/src/m3ninx/search/searcher/all_test.go @@ -39,7 +39,7 @@ func TestAllSearcher(t *testing.T) { allPl := roaring.NewPostingsList() reader.EXPECT().MatchAll().Return(allPl, nil) - pl, _, err := s.Search(reader) + pl, err := s.Search(reader) require.NoError(t, err) require.True(t, pl.Equal(allPl)) } diff --git a/src/m3ninx/search/searcher/conjunction_test.go b/src/m3ninx/search/searcher/conjunction_test.go index 17ba622872..32132ae8b5 100644 --- a/src/m3ninx/search/searcher/conjunction_test.go +++ b/src/m3ninx/search/searcher/conjunction_test.go @@ -21,6 +21,7 @@ package searcher import ( + "bytes" "testing" "github.com/m3db/m3/src/m3ninx/index" @@ -67,14 +68,32 @@ func TestConjunctionSearcher(t *testing.T) { gomock.InOrder( // Get the postings lists for the first Reader. - firstSearcher.EXPECT().Search(firstReader).Return(firstPL1, nil), - secondSearcher.EXPECT().Search(firstReader).Return(secondPL1, nil), - thirdSearcher.EXPECT().Search(firstReader).Return(thirdPL1, nil), + firstSearcher.EXPECT().Search(firstReader). + DoAndReturn(func(_ index.Reader) (postings.List, error) { + return mustReadOnlyBitmap(t, firstPL1), nil + }), + secondSearcher.EXPECT().Search(firstReader). + DoAndReturn(func(_ index.Reader) (postings.List, error) { + return mustReadOnlyBitmap(t, secondPL1), nil + }), + thirdSearcher.EXPECT().Search(firstReader). + DoAndReturn(func(_ index.Reader) (postings.List, error) { + return mustReadOnlyBitmap(t, thirdPL1), nil + }), // Get the postings lists for the second Reader. - firstSearcher.EXPECT().Search(secondReader).Return(firstPL2, nil), - secondSearcher.EXPECT().Search(secondReader).Return(secondPL2, nil), - thirdSearcher.EXPECT().Search(secondReader).Return(thirdPL2, nil), + firstSearcher.EXPECT().Search(secondReader). + DoAndReturn(func(_ index.Reader) (postings.List, error) { + return mustReadOnlyBitmap(t, firstPL2), nil + }), + secondSearcher.EXPECT().Search(secondReader). + DoAndReturn(func(_ index.Reader) (postings.List, error) { + return mustReadOnlyBitmap(t, secondPL2), nil + }), + thirdSearcher.EXPECT().Search(secondReader). 
+				DoAndReturn(func(_ index.Reader) (postings.List, error) {
+					return mustReadOnlyBitmap(t, thirdPL2), nil
+				}),
 	)
 
 	var (
@@ -120,3 +139,17 @@ func TestConjunctionSearcherError(t *testing.T) {
 		})
 	}
 }
+
+func mustReadOnlyBitmap(t *testing.T, pl postings.List) *roaring.ReadOnlyBitmap {
+	b, ok := roaring.BitmapFromPostingsList(pl)
+	require.True(t, ok)
+
+	buff := bytes.NewBuffer(nil)
+	_, err := b.WriteTo(buff)
+	require.NoError(t, err)
+
+	readOnlyBitmap, err := roaring.NewReadOnlyBitmap(buff.Bytes())
+	require.NoError(t, err)
+
+	return readOnlyBitmap
+}

From f2688e070e579e7193068076c3b5b60f54eaa887 Mon Sep 17 00:00:00 2001
From: Rob Skillington 
Date: Mon, 2 Nov 2020 14:24:27 -0500
Subject: [PATCH 012/106] Refactor read only bitmap range to own datastructure

---
 .../services/m3coordinator/ingest/metrics.go  |   2 +-
 ...index_single_node_high_concurrency_test.go |  18 +-
 .../storage/index/fields_terms_iterator.go    |   3 +
 src/m3ninx/index/segment/fst/segment.go       |   2 +-
 src/m3ninx/postings/compare.go                |   6 +-
 .../postings/roaring/bitmap_multi_readonly.go | 155 ++++++------
 .../roaring/bitmap_multi_readonly_test.go     |   4 +-
 .../postings/roaring/bitmap_readonly.go       | 221 ++----------------
 .../postings/roaring/bitmap_readonly_range.go | 203 ++++++++++++++++
 .../roaring/bitmap_readonly_range_test.go     |  90 +++++++
 10 files changed, 411 insertions(+), 293 deletions(-)
 create mode 100644 src/m3ninx/postings/roaring/bitmap_readonly_range.go
 create mode 100644 src/m3ninx/postings/roaring/bitmap_readonly_range_test.go

diff --git a/src/cmd/services/m3coordinator/ingest/metrics.go b/src/cmd/services/m3coordinator/ingest/metrics.go
index ad514fc117..19b60d57bd 100644
--- a/src/cmd/services/m3coordinator/ingest/metrics.go
+++ b/src/cmd/services/m3coordinator/ingest/metrics.go
@@ -26,7 +26,7 @@ import (
 	"github.com/uber-go/tally"
 )
 
-// LatencyBuckets are a set of latency buckets useful for measuring things.
+// LatencyBuckets are a set of latency buckets useful for measuring latencies.
 type LatencyBuckets struct {
 	WriteLatencyBuckets  tally.DurationBuckets
 	IngestLatencyBuckets tally.DurationBuckets
diff --git a/src/dbnode/integration/index_single_node_high_concurrency_test.go b/src/dbnode/integration/index_single_node_high_concurrency_test.go
index 2480f28b02..911b77cc3a 100644
--- a/src/dbnode/integration/index_single_node_high_concurrency_test.go
+++ b/src/dbnode/integration/index_single_node_high_concurrency_test.go
@@ -26,6 +26,7 @@ import (
 	"fmt"
 	"math/rand"
 	"strconv"
+	"strings"
 	"sync"
 	"testing"
 	"time"
@@ -377,7 +378,7 @@ func testIndexSingleNodeHighConcurrency(
 	// Now check all of them are individually indexed.
 	var (
 		fetchWg        sync.WaitGroup
-		notIndexedErrs []error
+		notIndexedErrs []string
 		notIndexedLock sync.Mutex
 	)
 	for i := 0; i < opts.concurrencyEnqueueWorker; i++ {
@@ -401,10 +402,18 @@ func testIndexSingleNodeHighConcurrency(
 				found := isIndexed(t, session, md.ID(), id, tags)
 				return found
 			}, 30*time.Second)
+
+			// Final check to get the corresponding error/mismatch.
+			indexed, err := isIndexedChecked(t, session, md.ID(), id, tags)
 			if !indexed {
-				err := fmt.Errorf("not indexed series: i=%d, j=%d", i, j)
+				if err != nil {
+					err = fmt.Errorf("not indexed: i=%d, j=%d, err=%v", i, j, err)
+				} else {
+					err = fmt.Errorf("not indexed: i=%d, j=%d, err=none", i, j)
+				}
+
 				notIndexedLock.Lock()
-				notIndexedErrs = append(notIndexedErrs, err)
+				notIndexedErrs = append(notIndexedErrs, err.Error())
 				notIndexedLock.Unlock()
 			}
 		})
@@ -414,7 +423,8 @@ func testIndexSingleNodeHighConcurrency(
 	fetchWg.Wait()
 
 	require.Equal(t, 0, len(notIndexedErrs),
-		fmt.Sprintf("not indexed errors: %v", notIndexedErrs[:min(5, len(notIndexedErrs))]))
+		fmt.Sprintf("not indexed errors: [%v]",
+			strings.Join(notIndexedErrs[:min(5, len(notIndexedErrs))], ", ")))
 	}
 
 	log.Info("data indexing verify done", zap.Duration("took", time.Since(start)))
diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go
index 9941a5bd13..eb1bbad689 100644
--- a/src/dbnode/storage/index/fields_terms_iterator.go
+++ b/src/dbnode/storage/index/fields_terms_iterator.go
@@ -260,6 +260,9 @@ func (fti *fieldsAndTermsIter) Close() error {
 	if fti.termIter != nil {
 		multiErr = multiErr.Add(fti.termIter.Close())
 	}
+	if fti.restrictByPostingsIntersect != nil {
+		multiErr = multiErr.Add(fti.restrictByPostingsIntersect.Close())
+	}
 	multiErr = multiErr.Add(fti.Reset(nil, fieldsAndTermsIteratorOpts{}))
 	return multiErr.FinalError()
 }
diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go
index 4f71ad496d..bd101b32ae 100644
--- a/src/m3ninx/index/segment/fst/segment.go
+++ b/src/m3ninx/index/segment/fst/segment.go
@@ -574,7 +574,7 @@ func (r *fsSegment) matchAllNotClosedMaybeFinalizedWithRLock() (postings.List, e
 	// NB(r): Important this is a read only bitmap since we perform
 	// operations on postings lists and expect them all to be read only
 	// postings lists.
-	return roaring.NewReadOnlyBitmapRange(0, uint64(r.numDocs))
+	return roaring.NewReadOnlyRangePostingsList(0, uint64(r.numDocs))
 }
 
 func (r *fsSegment) docNotClosedMaybeFinalizedWithRLock(id postings.ID) (doc.Document, error) {
diff --git a/src/m3ninx/postings/compare.go b/src/m3ninx/postings/compare.go
index 561bc3f773..c4728abff6 100644
--- a/src/m3ninx/postings/compare.go
+++ b/src/m3ninx/postings/compare.go
@@ -28,9 +28,11 @@ func Equal(a, b List) bool {
 		return false
 	}
 
-	iter := a.Iterator()
-	otherIter := b.Iterator()
+	return EqualIterator(a.Iterator(), b.Iterator())
+}
 
+// EqualIterator compares two postings iterators for equality.
+func EqualIterator(iter, otherIter Iterator) bool {
 	closed := false
 	defer func() {
 		if !closed {
diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
index 82ad54195e..bdd694b02a 100644
--- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
+++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
@@ -21,7 +21,6 @@
 package roaring
 
 import (
-	"errors"
 	"fmt"
 	"math/bits"
 	"sync"
@@ -29,28 +28,16 @@ import (
 	"github.com/m3db/m3/src/m3ninx/postings"
 )
 
-var (
-	// ErrNotReadOnlyBitmaps returned from operations that expect read only bitmaps.
-	ErrNotReadOnlyBitmaps = errors.New("not read only bitmaps")
-)
-
 // UnionReadOnly expects postings lists to all be read only.
func UnionReadOnly(unions []postings.List) (postings.List, error) { - union := make([]multiBitmapIterable, 0, len(unions)) + union := make([]readOnlyIterable, 0, len(unions)) for _, elem := range unions { - b, ok := elem.(*ReadOnlyBitmap) - if ok { - union = append(union, multiBitmapIterable{bitmap: b}) - continue - } - - mb, ok := elem.(*multiBitmap) - if ok { - union = append(union, multiBitmapIterable{multiBitmap: mb}) - continue + b, ok := elem.(readOnlyIterable) + if !ok { + return nil, ErrNotReadOnlyBitmap } - return nil, ErrNotReadOnlyBitmaps + union = append(union, b) } return newMultiBitmap(multiBitmapOptions{ @@ -64,38 +51,24 @@ func IntersectAndNegateReadOnly( intersects []postings.List, negates []postings.List, ) (postings.List, error) { - intersect := make([]multiBitmapIterable, 0, len(intersects)) + intersect := make([]readOnlyIterable, 0, len(intersects)) for _, elem := range intersects { - b, ok := elem.(*ReadOnlyBitmap) - if ok { - intersect = append(intersect, multiBitmapIterable{bitmap: b}) - continue - } - - mb, ok := elem.(*multiBitmap) - if ok { - intersect = append(intersect, multiBitmapIterable{multiBitmap: mb}) - continue + b, ok := elem.(readOnlyIterable) + if !ok { + return nil, ErrNotReadOnlyBitmap } - return nil, ErrNotReadOnlyBitmaps + intersect = append(intersect, b) } - negate := make([]multiBitmapIterable, 0, len(negates)) + negate := make([]readOnlyIterable, 0, len(negates)) for _, elem := range negates { - b, ok := elem.(*ReadOnlyBitmap) - if ok { - negate = append(negate, multiBitmapIterable{bitmap: b}) - continue - } - - mb, ok := elem.(*multiBitmap) - if ok { - negate = append(negate, multiBitmapIterable{multiBitmap: mb}) - continue + b, ok := elem.(readOnlyIterable) + if !ok { + return nil, ErrNotReadOnlyBitmap } - return nil, ErrNotReadOnlyBitmaps + negate = append(negate, b) } return newMultiBitmap(multiBitmapOptions{ @@ -105,7 +78,49 @@ func IntersectAndNegateReadOnly( }) } +// ReadOnlyBitmapIntersectCheck is a check that can be repeated +// against read only bitmaps without allocations. +type ReadOnlyBitmapIntersectCheck struct { + multiBitmapIterator *multiBitmapIterator + intersect []readOnlyIterable +} + +// NewReadOnlyBitmapIntersectCheck creates a new bitmap intersect checker, +// it is zero allocation once allocated to compare two bitmaps. +func NewReadOnlyBitmapIntersectCheck() *ReadOnlyBitmapIntersectCheck { + return &ReadOnlyBitmapIntersectCheck{ + multiBitmapIterator: newMultiBitmapIterator(multiBitmapOptions{}), + intersect: make([]readOnlyIterable, 2), + } +} + +// Intersects returns whether two posting lists intersect or not. +func (c *ReadOnlyBitmapIntersectCheck) Intersects(a, b postings.List) (bool, error) { + if pl, ok := a.(readOnlyIterable); ok { + c.intersect[0] = pl + } else { + return false, ErrNotReadOnlyBitmap + } + if pl, ok := b.(readOnlyIterable); ok { + c.intersect[1] = pl + } else { + return false, ErrNotReadOnlyBitmap + } + + c.multiBitmapIterator.Reset(multiBitmapOptions{ + op: multiBitmapOpIntersect, + intersect: c.intersect, + }) + return c.multiBitmapIterator.Next(), nil +} + +// Close will close the intersect checker. +func (c *ReadOnlyBitmapIntersectCheck) Close() error { + return c.multiBitmapIterator.Close() +} + var _ postings.List = (*multiBitmap)(nil) +var _ readOnlyIterable = (*multiBitmap)(nil) type multiBitmapOp uint8 @@ -133,29 +148,31 @@ type multiBitmap struct { multiBitmapOptions } -// multiBitmapIterable either contains a bitmap or another multi-iter. 
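The intersect check above is built once and reused across queries; the only
state it carries is the internal multi bitmap iterator that Reset re-arms on
each call. A minimal usage sketch, where a and b are placeholder postings
lists that must be read only backed:

	check := roaring.NewReadOnlyBitmapIntersectCheck()
	defer check.Close()

	match, err := check.Intersects(a, b)
	if err != nil {
		// ErrNotReadOnlyBitmap: an input was not a read only iterable.
		return err
	}
	if match {
		// The two postings lists share at least one document.
	}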
-type multiBitmapIterable struct { - multiBitmap *multiBitmap - bitmap *ReadOnlyBitmap +type readOnlyIterable interface { + Contains(id postings.ID) bool + ContainerIterator() containerIterator } -func (i multiBitmapIterable) Contains(id postings.ID) bool { - if i.multiBitmap != nil { - return i.multiBitmap.Contains(id) - } - return i.bitmap.Contains(id) +type containerIterator interface { + NextContainer() bool + ContainerKey() uint64 + ContainerUnion(ctx containerOpContext, target *bitmapContainer) + ContainerIntersect(ctx containerOpContext, target *bitmapContainer) + ContainerNegate(ctx containerOpContext, target *bitmapContainer) + Err() error + Close() } type multiBitmapOptions struct { op multiBitmapOp // union is valid when multiBitmapOpUnion, no other options valid. - union []multiBitmapIterable + union []readOnlyIterable // intersect is valid when multiBitmapOpIntersect used. - intersect []multiBitmapIterable + intersect []readOnlyIterable // intersectNegate is valid when multiBitmapOpIntersect used. - intersectNegate []multiBitmapIterable + intersectNegate []readOnlyIterable } func (o multiBitmapOptions) validate() error { @@ -222,7 +239,7 @@ func (i *multiBitmap) Iterator() postings.Iterator { return newMultiBitmapIterator(i.multiBitmapOptions) } -func (i *multiBitmap) containerIterator() containerIterator { +func (i *multiBitmap) ContainerIterator() containerIterator { return newMultiBitmapContainersIterator(i.multiBitmapOptions) } @@ -258,16 +275,6 @@ const ( multiContainerOpNegate ) -type containerIterator interface { - NextContainer() bool - ContainerKey() uint64 - ContainerUnion(ctx containerOpContext, target *bitmapContainer) - ContainerIntersect(ctx containerOpContext, target *bitmapContainer) - ContainerNegate(ctx containerOpContext, target *bitmapContainer) - Err() error - Close() -} - type containerOpContext struct { // tempBitmap is useful for temporary scratch operations and allows // for all sub-operations to share it rather than one per underlying @@ -312,17 +319,11 @@ func (i *multiBitmapIterator) Reset(opts multiBitmapOptions) { func appendContainerItersWithOp( initial []containerIteratorAndOp, iters []containerIteratorAndOp, - iterables []multiBitmapIterable, + iterables []readOnlyIterable, op multiContainerOp, ) ([]containerIteratorAndOp, []containerIteratorAndOp) { for _, elem := range iterables { - var it containerIterator - switch { - case elem.multiBitmap != nil: - it = elem.multiBitmap.containerIterator() - case elem.bitmap != nil: - it = elem.bitmap.containerIterator() - } + it := elem.ContainerIterator() initial = append(initial, containerIteratorAndOp{ it: it, @@ -783,10 +784,10 @@ func (b *bitmapContainer) Reset(set bool) { b.bitmap = b.allocated } -func (b *bitmapContainer) SetReadOnly(curr []uint64) { - // SetReadOnly should be used with care, only for single bitmap - // iteration. 
- b.bitmap = curr +func (b *bitmapContainer) readOnlyContainer() bitmapReadOnlyContainer { + return bitmapReadOnlyContainer{ + values: b.bitmap, + } } type bitmapContainerIterator struct { diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go index 9060586013..2f19c7ef23 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go @@ -146,8 +146,8 @@ func TestMultiBitmap(t *testing.T) { insertCount: test.insertCount, } for i := 0; i < each; i++ { - t.Run(fmt.Sprintf("attempt=%d, test=+%v", i, test), func(t *testing.T) { - allReadOnly, err := NewReadOnlyBitmapRange(0, uint64(test.insertRange)) + t.Run(fmt.Sprintf("i=%d, test=+%v", i, test), func(t *testing.T) { + allReadOnly, err := NewReadOnlyRangePostingsList(0, uint64(test.insertRange)) require.NoError(t, err) reg, regReadOnly := diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index e0547ee781..8af61656ad 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -43,8 +43,10 @@ const ( ) var ( - errNotPilosaRoaring = errors.New("not pilosa roaring format") - errNotReadOnlyBitmap = errors.New("not read only bitmap") + // ErrNotReadOnlyBitmap returned from operations that expect read only bitmaps. + ErrNotReadOnlyBitmap = errors.New("not read only bitmap") + + errNotPilosaRoaring = errors.New("not pilosa roaring format") ) type containerType byte @@ -87,44 +89,8 @@ func ReadOnlyBitmapFromPostingsList(pl postings.List) (*ReadOnlyBitmap, bool) { return result, true } -// ReadOnlyBitmapIntersectCheck is a check that can be repeated -// against read only bitmaps without allocations. -type ReadOnlyBitmapIntersectCheck struct { - multiBitmapIterator *multiBitmapIterator - intersect []multiBitmapIterable -} - -func NewReadOnlyBitmapIntersectCheck() *ReadOnlyBitmapIntersectCheck { - return &ReadOnlyBitmapIntersectCheck{ - multiBitmapIterator: newMultiBitmapIterator(multiBitmapOptions{}), - intersect: make([]multiBitmapIterable, 2), - } -} - -func (c *ReadOnlyBitmapIntersectCheck) Intersects(a, b postings.List) (bool, error) { - if pl, ok := a.(*ReadOnlyBitmap); ok { - c.intersect[0] = multiBitmapIterable{bitmap: pl} - } else if pl, ok := a.(*multiBitmap); ok { - c.intersect[0] = multiBitmapIterable{multiBitmap: pl} - } else { - return false, errNotReadOnlyBitmap - } - if pl, ok := b.(*ReadOnlyBitmap); ok { - c.intersect[1] = multiBitmapIterable{bitmap: pl} - } else if pl, ok := b.(*multiBitmap); ok { - c.intersect[1] = multiBitmapIterable{multiBitmap: pl} - } else { - return false, errNotReadOnlyBitmap - } - - c.multiBitmapIterator.Reset(multiBitmapOptions{ - op: multiBitmapOpIntersect, - intersect: c.intersect, - }) - return c.multiBitmapIterator.Next(), nil -} - var _ postings.List = (*ReadOnlyBitmap)(nil) +var _ readOnlyIterable = (*ReadOnlyBitmap)(nil) // ReadOnlyBitmap is a read only roaring Bitmap of // pilosa encoded roaring bitmaps, allocates very little on unmarshal @@ -134,10 +100,6 @@ var _ postings.List = (*ReadOnlyBitmap)(nil) type ReadOnlyBitmap struct { data []byte keyN uint64 - - rangeOverride bool // if rangeOverride then just a read only range - rangeStartInclusive uint64 - rangeEndExclusive uint64 } // NewReadOnlyBitmap returns a new read only bitmap. 
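Because Reset validates every container up front, a malformed or truncated
payload is rejected at construction time rather than deep inside a query. A
rough usage sketch (not part of the patch; data stands in for an encoded
pilosa roaring payload):

	bm, err := roaring.NewReadOnlyBitmap(data) // validates all containers
	if err != nil {
		return err // e.g. "data too small for bitmap: needs=..., actual=..."
	}
	iter := bm.Iterator()
	for iter.Next() {
		_ = iter.Current() // postings.ID
	}
	if err := iter.Err(); err != nil {
		return err // decode errors hit mid-iteration surface here
	}
	return iter.Close()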
@@ -149,28 +111,8 @@ func NewReadOnlyBitmap(data []byte) (*ReadOnlyBitmap, error) { return b, nil } -// NewReadOnlyBitmapRange returns a special read only bitmap that -// represents a range. -func NewReadOnlyBitmapRange( - startInclusive, endExclusive uint64, -) (*ReadOnlyBitmap, error) { - if endExclusive < startInclusive { - return nil, fmt.Errorf("end cannot be before start: start=%d, end=%d", - startInclusive, endExclusive) - } - - return &ReadOnlyBitmap{ - rangeOverride: true, - rangeStartInclusive: startInclusive, - rangeEndExclusive: endExclusive, - }, nil -} - +// Reset resets the read only bitmap. func (b *ReadOnlyBitmap) Reset(data []byte) error { - b.rangeOverride = false - b.rangeStartInclusive = 0 - b.rangeEndExclusive = 0 - // Reset to nil. if len(data) == 0 { b.data = nil @@ -368,13 +310,8 @@ func (b *ReadOnlyBitmap) containerAtIndex(index uint64) (readOnlyContainer, erro return container, nil } +// Contains returns whether postings ID is contained or not. func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { - if b.rangeOverride { - // Using range override. - return uint64(id) >= b.rangeStartInclusive && - uint64(id) < b.rangeEndExclusive - } - value := uint64(id) container, ok := b.container(highbits(value)) if !ok { @@ -392,10 +329,8 @@ func (b *ReadOnlyBitmap) Contains(id postings.ID) bool { return false } +// IsEmpty returns true if no results contained by postings. func (b *ReadOnlyBitmap) IsEmpty() bool { - if b.rangeOverride { - return b.rangeCount() == 0 - } return b.count() == 0 } @@ -414,40 +349,28 @@ func (b *ReadOnlyBitmap) count() int { return l } -func (b *ReadOnlyBitmap) rangeCount() int { - return int(b.rangeEndExclusive - b.rangeStartInclusive) -} - +// CountFast returns the count of entries in postings, if available, false +// if cannot calculate quickly. func (b *ReadOnlyBitmap) CountFast() (int, bool) { - if b.rangeOverride { - return b.rangeCount(), true - } return b.count(), true } +// CountSlow returns the count of entries in postings. func (b *ReadOnlyBitmap) CountSlow() int { - if b.rangeOverride { - return b.rangeCount() - } return b.count() } +// Iterator returns a postings iterator. func (b *ReadOnlyBitmap) Iterator() postings.Iterator { - if b.rangeOverride { - return postings.NewRangeIterator(postings.ID(b.rangeStartInclusive), - postings.ID(b.rangeEndExclusive)) - } return newReadOnlyBitmapIterator(b) } -func (b *ReadOnlyBitmap) containerIterator() containerIterator { - if b.rangeOverride { - return newReadOnlyBitmapRangeContainerIterator(b.rangeStartInclusive, - b.rangeEndExclusive) - } +// ContainerIterator returns a container iterator of the postings. +func (b *ReadOnlyBitmap) ContainerIterator() containerIterator { return newReadOnlyBitmapContainerIterator(b) } +// Equal returns whether this postings list matches another. 
func (b *ReadOnlyBitmap) Equal(other postings.List) bool { return postings.Equal(b, other) } @@ -854,117 +777,3 @@ func differenceBitmapInPlace(a, b []uint64) { ab[i+3] &= (^bb[i+3]) } } - -var _ containerIterator = (*readOnlyBitmapRangeContainerIterator)(nil) - -type readOnlyBitmapRangeContainerIterator struct { - startInclusive int64 // use int64 so endInclusive can be -1 if need be - endInclusive int64 // use int64 so endInclusive can be -1 if need be - first bool - key int64 -} - -func newReadOnlyBitmapRangeContainerIterator( - startInclusive, endExclusive uint64, -) *readOnlyBitmapRangeContainerIterator { - return &readOnlyBitmapRangeContainerIterator{ - startInclusive: int64(startInclusive), - endInclusive: int64(endExclusive - 1), - key: int64(startInclusive) / containerValues, - } -} - -func (i *readOnlyBitmapRangeContainerIterator) startInKey() bool { - return i.key == i.startInclusive/containerValues -} - -func (i *readOnlyBitmapRangeContainerIterator) endInKey() bool { - return i.key == i.endInclusive/containerValues -} - -func (i *readOnlyBitmapRangeContainerIterator) validKey() bool { - return i.key <= i.endInclusive/containerValues -} - -func (i *readOnlyBitmapRangeContainerIterator) NextContainer() bool { - if !i.first { - i.first = true - return i.validKey() - } - - if !i.validKey() { - return false - } - - i.key++ - return i.validKey() -} - -func (i *readOnlyBitmapRangeContainerIterator) ContainerKey() uint64 { - return uint64(i.key) -} - -func (i *readOnlyBitmapRangeContainerIterator) ContainerUnion( - ctx containerOpContext, - target *bitmapContainer, -) { - start := uint64(0) - if i.startInKey() { - start = uint64(i.startInclusive) % containerValues - } - - end := uint64(containerValues) - 1 - if i.endInKey() { - end = uint64(i.endInclusive) % containerValues - } - - // Set from [start, end+1) to union. - bitmapSetRange(target.bitmap, start, end+1) -} - -func (i *readOnlyBitmapRangeContainerIterator) ContainerIntersect( - ctx containerOpContext, - target *bitmapContainer, -) { - start := uint64(0) - if i.startInKey() { - start = uint64(i.startInclusive) % containerValues - } - - end := uint64(containerValues) - 1 - if i.endInKey() { - end = uint64(i.endInclusive) % containerValues - } - - // Create temp overlay and intersect with that. - ctx.tempBitmap.Reset(false) - bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1) - intersectBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) -} - -func (i *readOnlyBitmapRangeContainerIterator) ContainerNegate( - ctx containerOpContext, - target *bitmapContainer, -) { - start := uint64(0) - if i.startInKey() { - start = uint64(i.startInclusive) % containerValues - } - - end := uint64(containerValues) - 1 - if i.endInKey() { - end = uint64(i.endInclusive) % containerValues - } - - // Create temp overlay and intersect with that. - ctx.tempBitmap.Reset(false) - bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1) - differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) -} - -func (i *readOnlyBitmapRangeContainerIterator) Err() error { - return nil -} - -func (i *readOnlyBitmapRangeContainerIterator) Close() { -} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_range.go b/src/m3ninx/postings/roaring/bitmap_readonly_range.go new file mode 100644 index 0000000000..c1ff0c53ce --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_readonly_range.go @@ -0,0 +1,203 @@ +// Copyright (c) 2020 Uber Technologies, Inc. 
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package roaring
+
+import (
+	"fmt"
+
+	"github.com/m3db/m3/src/m3ninx/postings"
+)
+
+var _ postings.List = (*ReadOnlyRangePostingsList)(nil)
+var _ readOnlyIterable = (*ReadOnlyRangePostingsList)(nil)
+
+// ReadOnlyRangePostingsList is a read only range based postings list,
+// useful since it implements the read only iterable interface and can
+// therefore be used with UnionReadOnly and IntersectAndNegateReadOnly.
+type ReadOnlyRangePostingsList struct {
+	startInclusive uint64
+	endExclusive   uint64
+}
+
+// NewReadOnlyRangePostingsList returns a new read only range postings list
+// that can be used with UnionReadOnly and IntersectAndNegateReadOnly.
+func NewReadOnlyRangePostingsList(
+	startInclusive, endExclusive uint64,
+) (*ReadOnlyRangePostingsList, error) {
+	if endExclusive < startInclusive {
+		return nil, fmt.Errorf("end cannot be before start: start=%d, end=%d",
+			startInclusive, endExclusive)
+	}
+	return &ReadOnlyRangePostingsList{
+		startInclusive: startInclusive,
+		endExclusive:   endExclusive,
+	}, nil
+}
+
+// Contains returns whether the given postings ID is contained or not.
+func (b *ReadOnlyRangePostingsList) Contains(id postings.ID) bool {
+	return uint64(id) >= b.startInclusive && uint64(id) < b.endExclusive
+}
+
+func (b *ReadOnlyRangePostingsList) count() int {
+	return int(b.endExclusive - b.startInclusive)
+}
+
+// IsEmpty returns true if the postings list contains no results.
+func (b *ReadOnlyRangePostingsList) IsEmpty() bool {
+	return b.count() == 0
+}
+
+// CountFast returns the count of entries in the postings list and true if it
+// can be calculated quickly, otherwise false.
+func (b *ReadOnlyRangePostingsList) CountFast() (int, bool) {
+	return b.count(), true
+}
+
+// CountSlow returns the count of entries in the postings list.
+func (b *ReadOnlyRangePostingsList) CountSlow() int {
+	return b.count()
+}
+
+// Iterator returns a postings iterator.
+func (b *ReadOnlyRangePostingsList) Iterator() postings.Iterator {
+	return postings.NewRangeIterator(postings.ID(b.startInclusive),
+		postings.ID(b.endExclusive))
+}
+
+// ContainerIterator returns a container iterator of the postings.
+func (b *ReadOnlyRangePostingsList) ContainerIterator() containerIterator {
+	return newReadOnlyRangePostingsListContainerIterator(b.startInclusive,
+		b.endExclusive)
+}
+
+// Equal returns whether this postings list matches another.
+func (b *ReadOnlyRangePostingsList) Equal(other postings.List) bool { + return postings.Equal(b, other) +} + +var _ containerIterator = (*readOnlyRangePostingsListContainerIterator)(nil) + +type readOnlyRangePostingsListContainerIterator struct { + startInclusive int64 // use int64 so endInclusive can be -1 if need be + endInclusive int64 // use int64 so endInclusive can be -1 if need be + key int64 +} + +func newReadOnlyRangePostingsListContainerIterator( + startInclusive, endExclusive uint64, +) *readOnlyRangePostingsListContainerIterator { + return &readOnlyRangePostingsListContainerIterator{ + startInclusive: int64(startInclusive), + endInclusive: int64(endExclusive - 1), + key: (int64(startInclusive) / containerValues) - 1, + } +} + +func (i *readOnlyRangePostingsListContainerIterator) startInKey() bool { + return i.key == i.startInclusive/containerValues +} + +func (i *readOnlyRangePostingsListContainerIterator) endInKey() bool { + return i.key == i.endInclusive/containerValues +} + +func (i *readOnlyRangePostingsListContainerIterator) validKey() bool { + return i.key <= i.endInclusive/containerValues +} + +func (i *readOnlyRangePostingsListContainerIterator) NextContainer() bool { + if !i.validKey() { + return false + } + + i.key++ + return i.validKey() +} + +func (i *readOnlyRangePostingsListContainerIterator) ContainerKey() uint64 { + return uint64(i.key) +} + +func (i *readOnlyRangePostingsListContainerIterator) ContainerUnion( + ctx containerOpContext, + target *bitmapContainer, +) { + start := uint64(0) + if i.startInKey() { + start = uint64(i.startInclusive) % containerValues + } + + end := uint64(containerValues) - 1 + if i.endInKey() { + end = uint64(i.endInclusive) % containerValues + } + + // Set from [start, end+1) to union. + bitmapSetRange(target.bitmap, start, end+1) +} + +func (i *readOnlyRangePostingsListContainerIterator) ContainerIntersect( + ctx containerOpContext, + target *bitmapContainer, +) { + start := uint64(0) + if i.startInKey() { + start = uint64(i.startInclusive) % containerValues + } + + end := uint64(containerValues) - 1 + if i.endInKey() { + end = uint64(i.endInclusive) % containerValues + } + + // Create temp overlay and intersect with that. + ctx.tempBitmap.Reset(false) + bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1) + intersectBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) +} + +func (i *readOnlyRangePostingsListContainerIterator) ContainerNegate( + ctx containerOpContext, + target *bitmapContainer, +) { + start := uint64(0) + if i.startInKey() { + start = uint64(i.startInclusive) % containerValues + } + + end := uint64(containerValues) - 1 + if i.endInKey() { + end = uint64(i.endInclusive) % containerValues + } + + // Create temp overlay and intersect with that. + ctx.tempBitmap.Reset(false) + bitmapSetRange(ctx.tempBitmap.bitmap, start, end+1) + differenceBitmapInPlace(target.bitmap, ctx.tempBitmap.bitmap) +} + +func (i *readOnlyRangePostingsListContainerIterator) Err() error { + return nil +} + +func (i *readOnlyRangePostingsListContainerIterator) Close() { +} diff --git a/src/m3ninx/postings/roaring/bitmap_readonly_range_test.go b/src/m3ninx/postings/roaring/bitmap_readonly_range_test.go new file mode 100644 index 0000000000..af99120858 --- /dev/null +++ b/src/m3ninx/postings/roaring/bitmap_readonly_range_test.go @@ -0,0 +1,90 @@ +// Copyright (c) 2020 Uber Technologies, Inc. 
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package roaring
+
+import (
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"github.com/m3db/m3/src/m3ninx/postings"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestReadonlyRangePostingsListContainerIterator(t *testing.T) {
+	type testDef struct {
+		startLeadingEmptyContainers int
+		startAtBoundary             bool
+		endLeadingEmptyContainers   int
+		endAtBoundary               bool
+	}
+
+	var tests []testDef
+	for _, startAtBoundary := range []bool{false, true} {
+		for _, endAtBoundary := range []bool{false, true} {
+			emptyContainers := []int{0, 1, 3}
+			for _, startLeadingEmptyContainers := range emptyContainers {
+				for _, endLeadingEmptyContainers := range emptyContainers {
+					tests = append(tests, testDef{
+						startLeadingEmptyContainers: startLeadingEmptyContainers,
+						startAtBoundary:             startAtBoundary,
+						endLeadingEmptyContainers:   endLeadingEmptyContainers,
+						endAtBoundary:               endAtBoundary,
+					})
+				}
+			}
+		}
+	}
+
+	rng := rand.New(rand.NewSource(seed))
+	for i, test := range tests {
+		t.Run(fmt.Sprintf("i=%d, test=%+v", i, test), func(t *testing.T) {
+			start := uint64(test.startLeadingEmptyContainers) * containerValues
+			if !test.startAtBoundary {
+				start += uint64(rng.Int63n(int64(containerValues)))
+			}
+
+			end := (uint64(test.startLeadingEmptyContainers) * containerValues) +
+				(uint64(test.endLeadingEmptyContainers) * containerValues)
+			if !test.endAtBoundary {
+				// Calculate random number with the remaining range we have.
+				endMin := end
+				if start > endMin {
+					endMin = start
+				}
+				endMax := end + containerValues
+				endRange := endMax - endMin
+				// Calculate new end as min + range.
+				end = endMin + uint64(rng.Int63n(int64(endRange)))
+			} else {
+				// Add an extra container's worth of values so the end stays on a container boundary past the start.
+				end += containerValues
+			}
+
+			pl, err := NewReadOnlyRangePostingsList(start, end)
+			require.NoError(t, err)
+
+			expected := postings.NewRangeIterator(postings.ID(start), postings.ID(end))
+			require.True(t, postings.EqualIterator(expected, pl.Iterator()))
+		})
+	}
+}

From c1fbb80ba1b4529b483654fd65a559229fa8903e Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Tue, 3 Nov 2020 00:19:08 -0500
Subject: [PATCH 013/106] Add ability for RO postings lists to be enabled by env var

---
 .buildkite/pipeline.yml                       | 20 +++-
 .../aggregator/docker-compose.yml             |  2 +
 .../aggregator_legacy/docker-compose.yml      |  2 +
 .../carbon/docker-compose.yml                 |  2 +
 .../cold_writes_simple/docker-compose.yml     |  2 +
 .../docker-compose.yml                        |  2 +
 .../docker-compose.yml                        |  1 +
 .../multi_cluster_write/docker-compose.yml    |  4 +
 .../prometheus/docker-compose.yml             |  2 +
 .../prometheus_replication/docker-compose.yml |  4 +
 .../query_fanout/docker-compose.yml           |  6 ++
 .../repair/docker-compose.yml                 |  2 +
 .../repair_and_replication/docker-compose.yml |  4 +
 .../replication/docker-compose.yml            |  4 +
 .../simple_v2_batch_apis/docker-compose.yml   |  2 +
 src/dbnode/server/server.go                   |  8 ++
 .../storage/index/fields_terms_iterator.go    | 86 ++++++++++++----
 .../iterator_test.go => index/migration.go}   | 44 +++++----
 ...i_segments_multi_key_postings_list_iter.go | 14 ++-
 .../builder/multi_segments_terms_iter.go      | 15 ++-
 .../fst/fst_terms_postings_iterator.go        | 50 ++++++++--
 src/m3ninx/index/segment/fst/options.go       | 18 ++++
 src/m3ninx/index/segment/fst/segment.go       | 98 ++++++++++++-----
 .../mem/concurrent_postings_map_test.go       |  4 +-
 src/m3ninx/index/segment/mem/segment.go       |  2 +-
 src/m3ninx/index/segment/mem/terms_dict.go    |  8 +-
 .../index/segment/mem/terms_dict_test.go      |  4 +-
 src/m3ninx/index/segment/mem/types.go         |  2 +-
 src/m3ninx/postings/pilosa/codec.go           | 31 ++----
 src/m3ninx/postings/pilosa/iterator.go        | 70 -------------
 .../postings/roaring/bitmap_multi_readonly.go |  2 +-
 .../roaring/bitmap_multi_readonly_test.go     | 48 ++++++---
 src/m3ninx/postings/roaring/roaring.go        | 10 ++
 src/m3ninx/search/searcher/conjunction.go     | 30 +++++-
 .../search/searcher/conjunction_test.go       | 78 +++++++++++++++
 src/m3ninx/search/searcher/disjunction.go     | 31 ++++--
 src/m3ninx/search/searcher/empty.go           |  5 +-
 src/m3ninx/search/searcher/negation.go        | 27 ++++-
 38 files changed, 532 insertions(+), 212 deletions(-)
 rename src/m3ninx/{postings/pilosa/iterator_test.go => index/migration.go} (59%)
 delete mode 100644 src/m3ninx/postings/pilosa/iterator.go

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index c71d836da5..de3fe8b3bf 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -42,7 +42,7 @@ steps:
         run: app
         workdir: /go/src/github.com/m3db/m3
     <<: *common
-  - name: "Integration (:docker:)"
+  - name: "Integration (:docker:, postings=default) %n"
     command: make clean install-vendor-m3 docker-integration-test
     parallelism: 2
     env:
       CGO_ENABLED: 0
       GIMME_GO_VERSION: 1.13.x
     plugins:
       gopath-checkout#v1.0.1:
         import: github.com/m3db/m3
     <<: *common
+  - name: "Integration (:docker:, postings=read_only) %n"
+    command: M3DB_READ_ONLY_POSTINGS=true make clean install-vendor-m3 docker-integration-test
+    parallelism: 2
+    env:
+      CGO_ENABLED: 0
+      GIMME_GO_VERSION: 1.13.x
+    plugins:
+      gopath-checkout#v1.0.1:
+        import: github.com/m3db/m3
+    <<: *common
   - name: "Prometheus compatibility (:docker:)"
     command: make clean install-vendor-m3 docker-compatibility-test
     parallelism: 1
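Aside: the new ReadOnlyRangePostingsList above is easiest to understand through a small usage sketch. This is illustrative only and not part of the patch; it assumes the APIs exactly as introduced above (NewReadOnlyRangePostingsList, Contains, Iterator) and a throwaway main package:

package main

import (
	"fmt"

	"github.com/m3db/m3/src/m3ninx/postings"
	"github.com/m3db/m3/src/m3ninx/postings/roaring"
)

func main() {
	// Represents IDs [0, 5) without materializing a bitmap: only the two
	// bounds are stored, so a match-all over N documents costs O(1) memory.
	pl, err := roaring.NewReadOnlyRangePostingsList(0, 5)
	if err != nil {
		panic(err)
	}

	fmt.Println(pl.Contains(postings.ID(4))) // true
	fmt.Println(pl.Contains(postings.ID(5))) // false: the end is exclusive

	iter := pl.Iterator()
	for iter.Next() {
		fmt.Print(iter.Current(), " ") // 0 1 2 3 4
	}
}

This constant-space representation is what lets the match-all path later in this patch return a range postings list instead of a fully populated bitmap.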
@@ -62,17 +72,17 @@ steps: gopath-checkout#v1.0.1: import: github.com/m3db/m3 <<: *common - - name: "Integration (dbnode Recently Read) %n" + - name: "Integration (dbnode cache=LRU postings=default) %n" parallelism: 2 - command: make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=recently_read + command: make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru plugins: docker-compose#v2.5.1: run: app workdir: /go/src/github.com/m3db/m3 <<: *common - - name: "Integration (dbnode LRU) %n" + - name: "Integration (dbnode cache=LRU postings=read_only) %n" parallelism: 2 - command: make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru + command: M3DB_READ_ONLY_POSTINGS=true make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru plugins: docker-compose#v2.5.1: run: app diff --git a/scripts/docker-integration-tests/aggregator/docker-compose.yml b/scripts/docker-integration-tests/aggregator/docker-compose.yml index c93b41ee25..9c47ffc069 100644 --- a/scripts/docker-integration-tests/aggregator/docker-compose.yml +++ b/scripts/docker-integration-tests/aggregator/docker-compose.yml @@ -12,6 +12,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done m3coordinator01: expose: - "7202" diff --git a/scripts/docker-integration-tests/aggregator_legacy/docker-compose.yml b/scripts/docker-integration-tests/aggregator_legacy/docker-compose.yml index c93b41ee25..9c47ffc069 100644 --- a/scripts/docker-integration-tests/aggregator_legacy/docker-compose.yml +++ b/scripts/docker-integration-tests/aggregator_legacy/docker-compose.yml @@ -12,6 +12,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done m3coordinator01: expose: - "7202" diff --git a/scripts/docker-integration-tests/carbon/docker-compose.yml b/scripts/docker-integration-tests/carbon/docker-compose.yml index 53a28f0b88..8fa8fb9f02 100644 --- a/scripts/docker-integration-tests/carbon/docker-compose.yml +++ b/scripts/docker-integration-tests/carbon/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator01: expose: - "7201" diff --git a/scripts/docker-integration-tests/cold_writes_simple/docker-compose.yml b/scripts/docker-integration-tests/cold_writes_simple/docker-compose.yml index 53a28f0b88..8fa8fb9f02 100644 --- a/scripts/docker-integration-tests/cold_writes_simple/docker-compose.yml +++ b/scripts/docker-integration-tests/cold_writes_simple/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator01: expose: - "7201" diff --git a/scripts/docker-integration-tests/coordinator_config_rules/docker-compose.yml b/scripts/docker-integration-tests/coordinator_config_rules/docker-compose.yml index 53a28f0b88..8fa8fb9f02 100644 --- a/scripts/docker-integration-tests/coordinator_config_rules/docker-compose.yml +++ b/scripts/docker-integration-tests/coordinator_config_rules/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done 
coordinator01: expose: - "7201" diff --git a/scripts/docker-integration-tests/dedicated_etcd_embedded_coordinator/docker-compose.yml b/scripts/docker-integration-tests/dedicated_etcd_embedded_coordinator/docker-compose.yml index e155eb5334..914681a26d 100644 --- a/scripts/docker-integration-tests/dedicated_etcd_embedded_coordinator/docker-compose.yml +++ b/scripts/docker-integration-tests/dedicated_etcd_embedded_coordinator/docker-compose.yml @@ -12,6 +12,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=dbnode01 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode.yml:/etc/m3dbnode/m3dbnode.yml" etcd01: diff --git a/scripts/docker-integration-tests/multi_cluster_write/docker-compose.yml b/scripts/docker-integration-tests/multi_cluster_write/docker-compose.yml index b5b52d922d..1893222dfd 100644 --- a/scripts/docker-integration-tests/multi_cluster_write/docker-compose.yml +++ b/scripts/docker-integration-tests/multi_cluster_write/docker-compose.yml @@ -12,6 +12,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_a_m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-a.yml:/etc/m3dbnode/m3dbnode.yml" cluster_a_dbnode02: @@ -26,6 +27,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_a_m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-a.yml:/etc/m3dbnode/m3dbnode.yml" cluster_a_coordinator01: @@ -54,6 +56,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_b_m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-b.yml:/etc/m3dbnode/m3dbnode.yml" cluster_b_dbnode02: @@ -68,6 +71,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_b_m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-b.yml:/etc/m3dbnode/m3dbnode.yml" cluster_b_coordinator01: diff --git a/scripts/docker-integration-tests/prometheus/docker-compose.yml b/scripts/docker-integration-tests/prometheus/docker-compose.yml index 5516a35c14..e4f132be35 100644 --- a/scripts/docker-integration-tests/prometheus/docker-compose.yml +++ b/scripts/docker-integration-tests/prometheus/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator01: expose: - "7201" diff --git a/scripts/docker-integration-tests/prometheus_replication/docker-compose.yml b/scripts/docker-integration-tests/prometheus_replication/docker-compose.yml index 333ee79238..04d8e551f7 100644 --- a/scripts/docker-integration-tests/prometheus_replication/docker-compose.yml +++ b/scripts/docker-integration-tests/prometheus_replication/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator01: expose: - "7201" @@ -32,6 +34,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator02: expose: - "17201" diff --git 
a/scripts/docker-integration-tests/query_fanout/docker-compose.yml b/scripts/docker-integration-tests/query_fanout/docker-compose.yml index c5cba80180..c3d3af4957 100644 --- a/scripts/docker-integration-tests/query_fanout/docker-compose.yml +++ b/scripts/docker-integration-tests/query_fanout/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator-cluster-a: expose: - "7201" @@ -34,6 +36,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator-cluster-b: expose: - "7201" @@ -58,6 +62,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator-cluster-c: expose: - "7201" diff --git a/scripts/docker-integration-tests/repair/docker-compose.yml b/scripts/docker-integration-tests/repair/docker-compose.yml index b91284adcf..34b345b799 100644 --- a/scripts/docker-integration-tests/repair/docker-compose.yml +++ b/scripts/docker-integration-tests/repair/docker-compose.yml @@ -12,6 +12,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode.yml:/etc/m3dbnode/m3dbnode.yml" dbnode02: @@ -26,6 +27,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode.yml:/etc/m3dbnode/m3dbnode.yml" coordinator01: diff --git a/scripts/docker-integration-tests/repair_and_replication/docker-compose.yml b/scripts/docker-integration-tests/repair_and_replication/docker-compose.yml index b5b52d922d..1893222dfd 100644 --- a/scripts/docker-integration-tests/repair_and_replication/docker-compose.yml +++ b/scripts/docker-integration-tests/repair_and_replication/docker-compose.yml @@ -12,6 +12,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_a_m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-a.yml:/etc/m3dbnode/m3dbnode.yml" cluster_a_dbnode02: @@ -26,6 +27,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_a_m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-a.yml:/etc/m3dbnode/m3dbnode.yml" cluster_a_coordinator01: @@ -54,6 +56,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_b_m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-b.yml:/etc/m3dbnode/m3dbnode.yml" cluster_b_dbnode02: @@ -68,6 +71,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_b_m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-b.yml:/etc/m3dbnode/m3dbnode.yml" cluster_b_coordinator01: diff --git a/scripts/docker-integration-tests/replication/docker-compose.yml b/scripts/docker-integration-tests/replication/docker-compose.yml index b5b52d922d..1893222dfd 100644 --- a/scripts/docker-integration-tests/replication/docker-compose.yml +++ 
b/scripts/docker-integration-tests/replication/docker-compose.yml @@ -12,6 +12,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_a_m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-a.yml:/etc/m3dbnode/m3dbnode.yml" cluster_a_dbnode02: @@ -26,6 +27,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_a_m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-a.yml:/etc/m3dbnode/m3dbnode.yml" cluster_a_coordinator01: @@ -54,6 +56,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_b_m3db_local_1 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-b.yml:/etc/m3dbnode/m3dbnode.yml" cluster_b_dbnode02: @@ -68,6 +71,7 @@ services: image: "m3dbnode_integration:${REVISION}" environment: - M3DB_HOST_ID=cluster_b_m3db_local_2 + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done volumes: - "./m3dbnode-cluster-b.yml:/etc/m3dbnode/m3dbnode.yml" cluster_b_coordinator01: diff --git a/scripts/docker-integration-tests/simple_v2_batch_apis/docker-compose.yml b/scripts/docker-integration-tests/simple_v2_batch_apis/docker-compose.yml index bce43d559e..c09c93c64b 100644 --- a/scripts/docker-integration-tests/simple_v2_batch_apis/docker-compose.yml +++ b/scripts/docker-integration-tests/simple_v2_batch_apis/docker-compose.yml @@ -10,6 +10,8 @@ services: networks: - backend image: "m3dbnode_integration:${REVISION}" + environment: + - M3DB_READ_ONLY_POSTINGS # Remove after MigrationReadOnlyPostings done coordinator01: expose: - "7201" diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index ec96c8e44c..38cd632674 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -188,6 +188,14 @@ func Run(runOpts RunOptions) { xconfig.WarnOnDeprecation(cfg, logger) + // Log whether or not migration read only posting lists are enabled. + // Note: will be removed once read only postings lists deemed stable. + if m3ninxindex.MigrationReadOnlyPostings() { + logger.Info("read only postings lists enabled") + } else { + logger.Info("read only postings lists disabled") + } + // By default attempt to raise process limits, which is a benign operation. 
skipRaiseLimits := strings.TrimSpace(os.Getenv(skipRaiseProcessLimitsEnvVar)) if skipRaiseLimits != skipRaiseProcessLimitsEnvVarTrue { diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index eb1bbad689..7d07fa4f3b 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -23,10 +23,12 @@ package index import ( "errors" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" xerrors "github.com/m3db/m3/src/x/errors" + pilosaroaring "github.com/m3dbx/pilosa/roaring" ) var ( @@ -73,6 +75,8 @@ type fieldsAndTermsIter struct { postings postings.List } + restrictByPostingsBitmap *pilosaroaring.Bitmap + restrictByPostings postings.List restrictByPostingsIntersect *roaring.ReadOnlyBitmapIntersectCheck } @@ -88,9 +92,16 @@ type newFieldsAndTermsIteratorFn func( r segment.Reader, opts fieldsAndTermsIteratorOpts, ) (fieldsAndTermsIterator, error) -func newFieldsAndTermsIterator(reader segment.Reader, opts fieldsAndTermsIteratorOpts) (fieldsAndTermsIterator, error) { +func newFieldsAndTermsIterator( + reader segment.Reader, + opts fieldsAndTermsIteratorOpts, +) (fieldsAndTermsIterator, error) { + var restrictByPostingsIntersect *roaring.ReadOnlyBitmapIntersectCheck + if index.MigrationReadOnlyPostings() { + restrictByPostingsIntersect = roaring.NewReadOnlyBitmapIntersectCheck() + } iter := &fieldsAndTermsIter{ - restrictByPostingsIntersect: roaring.NewReadOnlyBitmapIntersectCheck(), + restrictByPostingsIntersect: restrictByPostingsIntersect, } err := iter.Reset(reader, opts) if err != nil { @@ -99,7 +110,10 @@ func newFieldsAndTermsIterator(reader segment.Reader, opts fieldsAndTermsIterato return iter, nil } -func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsIteratorOpts) error { +func (fti *fieldsAndTermsIter) Reset( + reader segment.Reader, + opts fieldsAndTermsIteratorOpts, +) error { restrictByPostingsIntersect := fti.restrictByPostingsIntersect *fti = fieldsAndTermsIterZeroed fti.restrictByPostingsIntersect = restrictByPostingsIntersect @@ -132,7 +146,15 @@ func (fti *fieldsAndTermsIter) Reset(reader segment.Reader, opts fieldsAndTermsI } // Hold onto the postings bitmap to intersect against on a per term basis. - fti.restrictByPostings = pl + if index.MigrationReadOnlyPostings() { + fti.restrictByPostings = pl + } else { + var ok bool + fti.restrictByPostingsBitmap, ok = roaring.BitmapFromPostingsList(pl) + if !ok { + return errUnpackBitmapFromPostingsList + } + } return nil } @@ -202,23 +224,45 @@ func (fti *fieldsAndTermsIter) setNext() bool { func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) { for fti.termIter.Next() { fti.current.term, fti.current.postings = fti.termIter.Current() - if fti.restrictByPostings == nil { - // No restrictions. - return true, nil - } - - // Check term isn't part of at least some of the documents we're - // restricted to providing results for based on intersection - // count. - restrictBy := fti.restrictByPostings - curr := fti.current.postings - match, err := fti.restrictByPostingsIntersect.Intersects(restrictBy, curr) - if err != nil { - return false, err - } - if match { - // Matches, this is next result. - return true, nil + if index.MigrationReadOnlyPostings() { + if fti.restrictByPostings == nil { + // No restrictions. 
+				return true, nil
+			}
+
+			// Check the term is part of at least some of the documents we're
+			// restricted to providing results for, based on the intersection
+			// count.
+			restrictBy := fti.restrictByPostings
+			curr := fti.current.postings
+			match, err := fti.restrictByPostingsIntersect.Intersects(restrictBy, curr)
+			if err != nil {
+				return false, err
+			}
+			if match {
+				// Matches, this is next result.
+				return true, nil
+			}
+		} else {
+			if fti.restrictByPostingsBitmap == nil {
+				// No restrictions.
+				return true, nil
+			}
+
+			bitmap, ok := roaring.BitmapFromPostingsList(fti.current.postings)
+			if !ok {
+				return false, errUnpackBitmapFromPostingsList
+			}
+
+			// Check the term is part of at least some of the documents we're
+			// restricted to providing results for, based on the intersection
+			// count.
+			// Note: IntersectionCount is significantly faster than intersecting and
+			// counting results and also does not allocate.
+			if n := fti.restrictByPostingsBitmap.IntersectionCount(bitmap); n > 0 {
+				// Matches, this is next result.
+				return true, nil
+			}
+		}
 	}
 	if err := fti.termIter.Err(); err != nil {
diff --git a/src/m3ninx/postings/pilosa/iterator_test.go b/src/m3ninx/index/migration.go
similarity index 59%
rename from src/m3ninx/postings/pilosa/iterator_test.go
rename to src/m3ninx/index/migration.go
index d41d045e31..dcf4b948c5 100644
--- a/src/m3ninx/postings/pilosa/iterator_test.go
+++ b/src/m3ninx/index/migration.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Uber Technologies, Inc.
+// Copyright (c) 2020 Uber Technologies, Inc.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,28 +18,32 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-package pilosa
+package index
 
 import (
-	"testing"
+	"os"
+	"strings"
 
-	"github.com/m3db/m3/src/m3ninx/postings"
-	"github.com/m3dbx/pilosa/roaring"
-
-	"github.com/stretchr/testify/require"
+	"go.uber.org/atomic"
 )
 
-func TestIterator(t *testing.T) {
-	b := roaring.NewBitmap(1, 2, 4, 3)
-	iter := NewIterator(b.Iterator())
-	require.True(t, iter.Next())
-	require.Equal(t, postings.ID(1), iter.Current())
-	require.True(t, iter.Next())
-	require.Equal(t, postings.ID(2), iter.Current())
-	require.True(t, iter.Next())
-	require.Equal(t, postings.ID(3), iter.Current())
-	require.True(t, iter.Next())
-	require.Equal(t, postings.ID(4), iter.Current())
-	require.NoError(t, iter.Err())
-	require.NoError(t, iter.Close())
+func init() {
+	if strings.ToLower(os.Getenv("M3DB_READ_ONLY_POSTINGS")) == "true" {
+		// Once the migration completes all of this code is removed and the toggle goes away.
+		SetMigrationReadOnlyPostings(true)
+	}
+}
+
+var migrationReadOnlyPostings = atomic.NewBool(false)
+
+// MigrationReadOnlyPostings returns whether the migration read only postings
+// execution is enabled or not.
+func MigrationReadOnlyPostings() bool {
+	return migrationReadOnlyPostings.Load()
+}
+
+// SetMigrationReadOnlyPostings sets whether the migration read only postings
+// execution is enabled or not.
+func SetMigrationReadOnlyPostings(v bool) { + migrationReadOnlyPostings.Store(v) } diff --git a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go index a7b002f388..3d3330a554 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go @@ -149,8 +149,18 @@ func (i *multiKeyPostingsListIterator) Next() bool { if fieldsKeyIter.segment.offset == 0 { // No offset, which means is first segment we are combining from - // so can just direct union - i.currFieldPostingsList.Union(pl) + // so can just direct union. + if index.MigrationReadOnlyPostings() { + if err := i.currFieldPostingsList.AddIterator(pl.Iterator()); err != nil { + i.err = err + return false + } + } else { + if err := i.currFieldPostingsList.Union(pl); err != nil { + i.err = err + return false + } + } continue } diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go index c4edeeedb2..225bc3b732 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go @@ -21,6 +21,7 @@ package builder import ( + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -137,8 +138,18 @@ func (i *termsIterFromSegments) Next() bool { if termsKeyIter.segment.offset == 0 { // No offset, which means is first segment we are combining from - // so can just direct union - i.currPostingsList.Union(list) + // so can just direct union. + if index.MigrationReadOnlyPostings() { + if err := i.currPostingsList.AddIterator(list.Iterator()); err != nil { + i.err = err + return false + } + } else { + if err := i.currPostingsList.Union(list); err != nil { + i.err = err + return false + } + } continue } diff --git a/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go b/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go index 6cd4e1eaf5..c74a7cb15d 100644 --- a/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go +++ b/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go @@ -21,13 +21,28 @@ package fst import ( + "github.com/m3db/m3/src/m3ninx/index" sgmt "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" + pilosaroaring "github.com/m3dbx/pilosa/roaring" ) +// postingsIterRoaringPoolingConfig uses a configuration that avoids allocating +// any containers in the roaring bitmap, since these roaring bitmaps are backed +// by mmaps and don't have any native containers themselves. 
+var postingsIterRoaringPoolingConfig = pilosaroaring.ContainerPoolingConfiguration{ + MaxArraySize: 0, + MaxRunsSize: 0, + AllocateBitmap: false, + MaxCapacity: 0, + MaxKeysAndContainersSliceLength: 128 * 10, +} + type fstTermsPostingsIter struct { - bitmap *roaring.ReadOnlyBitmap + bitmap *roaring.ReadOnlyBitmap + legacyBitmap *pilosaroaring.Bitmap + legacyList postings.List seg *fsSegment termsIter *fstTermsIter @@ -36,8 +51,19 @@ type fstTermsPostingsIter struct { } func newFSTTermsPostingsIter() *fstTermsPostingsIter { + var ( + readOnlyBitmap *roaring.ReadOnlyBitmap + legacyBitmap *pilosaroaring.Bitmap + ) + if index.MigrationReadOnlyPostings() { + readOnlyBitmap = &roaring.ReadOnlyBitmap{} + } else { + legacyBitmap = pilosaroaring.NewBitmapWithPooling(postingsIterRoaringPoolingConfig) + } i := &fstTermsPostingsIter{ - bitmap: &roaring.ReadOnlyBitmap{}, + bitmap: readOnlyBitmap, + legacyBitmap: legacyBitmap, + legacyList: roaring.NewPostingsListFromBitmap(legacyBitmap), } i.clear() return i @@ -46,7 +72,11 @@ func newFSTTermsPostingsIter() *fstTermsPostingsIter { var _ sgmt.TermsIterator = &fstTermsPostingsIter{} func (f *fstTermsPostingsIter) clear() { - f.bitmap.Reset(nil) + if index.MigrationReadOnlyPostings() { + f.bitmap.Reset(nil) + } else { + f.legacyBitmap.Reset() + } f.seg = nil f.termsIter = nil f.currTerm = nil @@ -77,15 +107,23 @@ func (f *fstTermsPostingsIter) Next() bool { currOffset := f.termsIter.CurrentOffset() f.seg.RLock() - f.err = f.seg.unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(f.bitmap, - currOffset) + if index.MigrationReadOnlyPostings() { + f.err = f.seg.unmarshalReadOnlyBitmapNotClosedMaybeFinalizedWithLock(f.bitmap, + currOffset) + } else { + f.err = f.seg.unmarshalBitmapNotClosedMaybeFinalizedWithLock(f.legacyBitmap, + currOffset) + } f.seg.RUnlock() return f.err == nil } func (f *fstTermsPostingsIter) Current() ([]byte, postings.List) { - return f.currTerm, f.bitmap + if index.MigrationReadOnlyPostings() { + return f.currTerm, f.bitmap + } + return f.currTerm, f.legacyList } func (f *fstTermsPostingsIter) Err() error { diff --git a/src/m3ninx/index/segment/fst/options.go b/src/m3ninx/index/segment/fst/options.go index bb87bff757..275cd75922 100644 --- a/src/m3ninx/index/segment/fst/options.go +++ b/src/m3ninx/index/segment/fst/options.go @@ -22,6 +22,7 @@ package fst import ( "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/x/bytes" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/instrument" @@ -46,6 +47,12 @@ type Options interface { // BytesPool returns the bytes pool. BytesPool() pool.BytesPool + // SetPostingsListPool sets the postings list pool. + SetPostingsListPool(value postings.Pool) Options + + // PostingsListPool returns the postings list pool. + PostingsListPool() postings.Pool + // SetContextPool sets the contextPool. SetContextPool(value context.Pool) Options @@ -78,6 +85,7 @@ func NewOptions() Options { iopts: instrument.NewOptions(), bytesSliceArrPool: arrPool, bytesPool: bytesPool, + postingsPool: postings.NewPool(nil, roaring.NewPostingsList), // Use a zero pool, this should be overriden at config time. contextPool: context.NewPool(context.NewOptions(). SetContextPoolOptions(pool.NewObjectPoolOptions().SetSize(0)). 
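The MigrationReadOnlyPostings() branching above is the recurring shape of this patch: each postings call site consults the flag and picks either the read only or the legacy implementation, so the two paths can be compared side by side in CI via the M3DB_READ_ONLY_POSTINGS environment variable. As a minimal sketch of how a test might force one path (the test name is illustrative; both functions are defined in migration.go above):

package index_test

import (
	"testing"

	"github.com/m3db/m3/src/m3ninx/index"
)

func TestWithReadOnlyPostings(t *testing.T) {
	// Enable the read only path for this test only, restoring the
	// default afterwards so other tests are unaffected.
	index.SetMigrationReadOnlyPostings(true)
	defer index.SetMigrationReadOnlyPostings(false)

	if !index.MigrationReadOnlyPostings() {
		t.Fatal("expected read only postings to be enabled")
	}
}

The same toggle-and-restore pattern appears in conjunction_test.go later in this patch.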
@@ -105,6 +113,16 @@ func (o *opts) BytesPool() pool.BytesPool { return o.bytesPool } +func (o *opts) SetPostingsListPool(v postings.Pool) Options { + opts := *o + opts.postingsPool = v + return &opts +} + +func (o *opts) PostingsListPool() postings.Pool { + return o.postingsPool +} + func (o *opts) SetContextPool(value context.Pool) Options { opts := *o opts.contextPool = value diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index bd101b32ae..c5c9049c93 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -34,11 +34,13 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding" "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding/docs" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/pilosa" "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/x" "github.com/m3db/m3/src/x/context" xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/mmap" + pilosaroaring "github.com/m3dbx/pilosa/roaring" "github.com/m3dbx/vellum" ) @@ -385,7 +387,7 @@ func (i *termsIterable) termsNotClosedMaybeFinalizedWithRLock( return i.postingsIter, nil } -func (r *fsSegment) unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(b *roaring.ReadOnlyBitmap, offset uint64) error { +func (r *fsSegment) unmarshalReadOnlyBitmapNotClosedMaybeFinalizedWithLock(b *roaring.ReadOnlyBitmap, offset uint64) error { if r.finalized { return errReaderFinalized } @@ -398,6 +400,19 @@ func (r *fsSegment) unmarshalPostingsListBitmapNotClosedMaybeFinalizedWithLock(b return b.Reset(postingsBytes) } +func (r *fsSegment) unmarshalBitmapNotClosedMaybeFinalizedWithLock(b *pilosaroaring.Bitmap, offset uint64) error { + if r.finalized { + return errReaderFinalized + } + + postingsBytes, err := r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, offset) + if err != nil { + return fmt.Errorf("unable to retrieve postings data: %v", err) + } + + return b.UnmarshalBinary(postingsBytes) +} + func (r *fsSegment) matchFieldNotClosedMaybeFinalizedWithRLock( field []byte, ) (postings.List, error) { @@ -418,10 +433,13 @@ func (r *fsSegment) matchFieldNotClosedMaybeFinalizedWithRLock( } if !exists { // i.e. we don't know anything about the term, so can early return an empty postings list - // NB(r): Important this is a read only bitmap since we perform - // operations on postings lists and expect them all to be read only - // postings lists. - return roaring.NewReadOnlyBitmap(nil) + if index.MigrationReadOnlyPostings() { + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) + } + return r.opts.PostingsListPool().Get(), nil } protoBytes, _, err := r.retrieveTermsBytesWithRLock(r.data.FSTTermsData.Bytes, termsFSTOffset) @@ -454,10 +472,13 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( if !exists { // i.e. we don't know anything about the field, so can early return an empty postings list - // NB(r): Important this is a read only bitmap since we perform - // operations on postings lists and expect them all to be read only - // postings lists. - return roaring.NewReadOnlyBitmap(nil) + if index.MigrationReadOnlyPostings() { + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. 
+ return roaring.NewReadOnlyBitmap(nil) + } + return r.opts.PostingsListPool().Get(), nil } fstCloser := x.NewSafeCloser(termsFST) @@ -470,10 +491,13 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( if !exists { // i.e. we don't know anything about the term, so can early return an empty postings list - // NB(r): Important this is a read only bitmap since we perform - // operations on postings lists and expect them all to be read only - // postings lists. - return roaring.NewReadOnlyBitmap(nil) + if index.MigrationReadOnlyPostings() { + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) + } + return r.opts.PostingsListPool().Get(), nil } pl, err := r.retrievePostingsListWithRLock(postingsOffset) @@ -510,10 +534,13 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( if !exists { // i.e. we don't know anything about the field, so can early return an empty postings list - // NB(r): Important this is a read only bitmap since we perform - // operations on postings lists and expect them all to be read only - // postings lists. - return roaring.NewReadOnlyBitmap(nil) + if index.MigrationReadOnlyPostings() { + // NB(r): Important this is a read only bitmap since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyBitmap(nil) + } + return r.opts.PostingsListPool().Get(), nil } var ( @@ -546,9 +573,13 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( iterErr = iter.Next() } - // NB(r): Can use union read only since we are guaranteed all - // postings lists are read only. - pl, err := roaring.UnionReadOnly(pls) + var pl postings.List + if index.MigrationReadOnlyPostings() { + // Perform a lazy fast union. + pl, err = roaring.UnionReadOnly(pls) + } else { + pl, err = roaring.Union(pls) + } if err != nil { return nil, err } @@ -571,10 +602,20 @@ func (r *fsSegment) matchAllNotClosedMaybeFinalizedWithRLock() (postings.List, e return nil, errReaderFinalized } - // NB(r): Important this is a read only bitmap since we perform - // operations on postings lists and expect them all to be read only - // postings lists. - return roaring.NewReadOnlyRangePostingsList(0, uint64(r.numDocs)) + if index.MigrationReadOnlyPostings() { + // NB(r): Important this is a read only postings since we perform + // operations on postings lists and expect them all to be read only + // postings lists. + return roaring.NewReadOnlyRangePostingsList(0, uint64(r.numDocs)) + } + + pl := r.opts.PostingsListPool().Get() + err := pl.AddRange(0, postings.ID(r.numDocs)) + if err != nil { + return nil, err + } + + return pl, nil } func (r *fsSegment) docNotClosedMaybeFinalizedWithRLock(id postings.ID) (doc.Document, error) { @@ -628,8 +669,13 @@ func (r *fsSegment) retrievePostingsListWithRLock(postingsOffset uint64) (postin if err != nil { return nil, fmt.Errorf("unable to retrieve postings data: %v", err) } - // Read only bitmap is a very low allocation postings list. - return roaring.NewReadOnlyBitmap(postingsBytes) + + if index.MigrationReadOnlyPostings() { + // Read only bitmap is a very low allocation postings list. 
+ return roaring.NewReadOnlyBitmap(postingsBytes) + } + + return pilosa.Unmarshal(postingsBytes) } func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (*vellum.FST, bool, error) { diff --git a/src/m3ninx/index/segment/mem/concurrent_postings_map_test.go b/src/m3ninx/index/segment/mem/concurrent_postings_map_test.go index 6f18bf228a..106f83b99e 100644 --- a/src/m3ninx/index/segment/mem/concurrent_postings_map_test.go +++ b/src/m3ninx/index/segment/mem/concurrent_postings_map_test.go @@ -41,7 +41,7 @@ func TestConcurrentPostingsMap(t *testing.T) { pl, ok := pm.Get([]byte("foo")) require.True(t, ok) - require.Equal(t, 2, pl.Len()) + require.Equal(t, 2, pl.CountSlow()) require.True(t, pl.Contains(1)) require.True(t, pl.Contains(3)) @@ -51,7 +51,7 @@ func TestConcurrentPostingsMap(t *testing.T) { re := regexp.MustCompile("ba.*") pl, ok = pm.GetRegex(re) require.True(t, ok) - require.Equal(t, 2, pl.Len()) + require.Equal(t, 2, pl.CountSlow()) require.True(t, pl.Contains(2)) require.True(t, pl.Contains(4)) diff --git a/src/m3ninx/index/segment/mem/segment.go b/src/m3ninx/index/segment/mem/segment.go index 9a45d1b2ec..6b726f9827 100644 --- a/src/m3ninx/index/segment/mem/segment.go +++ b/src/m3ninx/index/segment/mem/segment.go @@ -466,7 +466,7 @@ func (s *memSegment) FieldsPostingsList() (segment.FieldsPostingsListIterator, e if err := s.checkIsSealedWithRLock(); err != nil { return nil, err } - return s.termsDict.FieldsPostingsList(), nil + return s.termsDict.FieldsPostingsList() } func (s *memSegment) Terms(name []byte) (segment.TermsIterator, error) { diff --git a/src/m3ninx/index/segment/mem/terms_dict.go b/src/m3ninx/index/segment/mem/terms_dict.go index fc2f7087c1..94b3f4289a 100644 --- a/src/m3ninx/index/segment/mem/terms_dict.go +++ b/src/m3ninx/index/segment/mem/terms_dict.go @@ -87,7 +87,7 @@ func (d *termsDict) Fields() sgmt.FieldsIterator { return newBytesSliceIter(fields, d.opts) } -func (d *termsDict) FieldsPostingsList() sgmt.FieldsPostingsListIterator { +func (d *termsDict) FieldsPostingsList() (sgmt.FieldsPostingsListIterator, error) { d.fields.RLock() defer d.fields.RUnlock() // NB(bodu): This is probably fine since the terms dict/mem segment is only used in tests. 
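The Len to CountSlow renames in the mem segment tests above reflect splitting postings cardinality into two methods: CountFast, which may decline to answer, and CountSlow, which always answers exactly. A small sketch of the intended calling convention (the cardinality helper is hypothetical; the method signatures are as shown in this patch):

package main

import (
	"fmt"

	"github.com/m3db/m3/src/m3ninx/postings"
	"github.com/m3db/m3/src/m3ninx/postings/roaring"
)

// cardinality prefers the cheap count when the postings list can provide
// one and falls back to the exact but potentially slower count otherwise.
func cardinality(pl postings.List) int {
	if n, ok := pl.CountFast(); ok {
		return n
	}
	return pl.CountSlow()
}

func main() {
	pl := roaring.NewPostingsList()
	_ = pl.Insert(postings.ID(7))
	_ = pl.Insert(postings.ID(42))
	fmt.Println(cardinality(pl)) // 2
}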
@@ -101,13 +101,15 @@ func (d *termsDict) FieldsPostingsList() sgmt.FieldsPostingsListIterator {
 				d.currFieldsPostingsLists = append(d.currFieldsPostingsLists, entry.value)
 			}
 		}
-		pl.UnionMany(d.currFieldsPostingsLists)
+		if err := pl.UnionMany(d.currFieldsPostingsLists); err != nil {
+			return nil, err
+		}
 		fields = append(fields, uniqueField{
 			field:        field,
 			postingsList: pl,
 		})
 	}
-	return newUniqueFieldsIter(fields, d.opts)
+	return newUniqueFieldsIter(fields, d.opts), nil
 }
 
 func (d *termsDict) Terms(field []byte) sgmt.TermsIterator {
diff --git a/src/m3ninx/index/segment/mem/terms_dict_test.go b/src/m3ninx/index/segment/mem/terms_dict_test.go
index c1ec4e2f40..5670ad5b65 100644
--- a/src/m3ninx/index/segment/mem/terms_dict_test.go
+++ b/src/m3ninx/index/segment/mem/terms_dict_test.go
@@ -232,7 +232,7 @@ func (t *termsDictionaryTestSuite) TestMatchTermNoResults() {
 			if pl == nil {
 				return false, fmt.Errorf("postings list returned should not be nil")
 			}
-			if pl.Len() != 0 {
+			if pl.CountSlow() != 0 {
 				return false, fmt.Errorf("postings list contains unexpected IDs")
 			}
 
@@ -288,7 +288,7 @@ func (t *termsDictionaryTestSuite) TestMatchRegexNoResults() {
 			if pl == nil {
 				return false, fmt.Errorf("postings list returned should not be nil")
 			}
-			if pl.Len() != 0 {
+			if pl.CountSlow() != 0 {
 				return false, fmt.Errorf("postings list contains unexpected IDs")
 			}
 
diff --git a/src/m3ninx/index/segment/mem/types.go b/src/m3ninx/index/segment/mem/types.go
index 13bbea45fc..f1661eab4e 100644
--- a/src/m3ninx/index/segment/mem/types.go
+++ b/src/m3ninx/index/segment/mem/types.go
@@ -53,7 +53,7 @@ type termsDictionary interface {
 	Fields() sgmt.FieldsIterator
 
 	// Fields returns the known fields.
-	FieldsPostingsList() sgmt.FieldsPostingsListIterator
+	FieldsPostingsList() (sgmt.FieldsPostingsListIterator, error)
 
 	// Terms returns the known terms values for the given field.
 	Terms(field []byte) sgmt.TermsIterator
diff --git a/src/m3ninx/postings/pilosa/codec.go b/src/m3ninx/postings/pilosa/codec.go
index 8c8998f7b1..7e06f2c06f 100644
--- a/src/m3ninx/postings/pilosa/codec.go
+++ b/src/m3ninx/postings/pilosa/codec.go
@@ -22,12 +22,15 @@ package pilosa
 
 import (
 	"bytes"
+	"errors"
 
 	"github.com/m3db/m3/src/m3ninx/postings"
 	idxroaring "github.com/m3db/m3/src/m3ninx/postings/roaring"
 	"github.com/m3dbx/pilosa/roaring"
 )
 
+var errNotPilosaRoaringBitmap = errors.New("not pilosa roaring bitmap")
+
 // Encoder helps serialize a Pilosa RoaringBitmap
 type Encoder struct {
 	scratchBuffer bytes.Buffer
@@ -50,14 +53,12 @@ func (e *Encoder) Reset() {
 func (e *Encoder) Encode(pl postings.List) ([]byte, error) {
 	e.scratchBuffer.Reset()
 
-	// Optimistically try to see if we can extract from the postings list itself
+	// Only work with pilosa roaring bitmaps, since re-encoding any other
+	// format as a pilosa postings list before writing it out would cause
+	// large allocations.
bitmap, ok := idxroaring.BitmapFromPostingsList(pl) if !ok { - var err error - bitmap, err = toPilosa(pl) - if err != nil { - return nil, err - } + return nil, errNotPilosaRoaringBitmap } if _, err := bitmap.WriteTo(&e.scratchBuffer); err != nil { @@ -67,24 +68,6 @@ func (e *Encoder) Encode(pl postings.List) ([]byte, error) { return e.scratchBuffer.Bytes(), nil } -func toPilosa(pl postings.List) (*roaring.Bitmap, error) { - bitmap := roaring.NewBitmap() - iter := pl.Iterator() - - for iter.Next() { - _, err := bitmap.Add(uint64(iter.Current())) - if err != nil { - return nil, err - } - } - - if err := iter.Err(); err != nil { - return nil, err - } - - return bitmap, nil -} - // Unmarshal unmarshals the provided bytes into a postings.List. func Unmarshal(data []byte) (postings.List, error) { bitmap := roaring.NewBitmap() diff --git a/src/m3ninx/postings/pilosa/iterator.go b/src/m3ninx/postings/pilosa/iterator.go deleted file mode 100644 index eb5e06fba5..0000000000 --- a/src/m3ninx/postings/pilosa/iterator.go +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2018 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package pilosa - -import ( - "github.com/m3db/m3/src/m3ninx/postings" - "github.com/m3dbx/pilosa/roaring" -) - -// NB: need to do this to find a path into our postings list which doesn't require every -// insert to grab a lock. Need to make a non thread-safe version of our api. -// FOLLOWUP(prateek): tracking this issue in https://github.com/m3db/m3ninx/issues/65 - -type iterator struct { - iter *roaring.Iterator - current uint64 - hasNext bool -} - -var _ postings.Iterator = &iterator{} - -// NewIterator returns a postings.Iterator wrapping a pilosa roaring.Iterator. 
-func NewIterator(iter *roaring.Iterator) postings.Iterator { - return &iterator{ - iter: iter, - hasNext: true, - } -} - -func (p *iterator) Next() bool { - if !p.hasNext { - return false - } - v, eof := p.iter.Next() - p.current = v - p.hasNext = !eof - return p.hasNext -} - -func (p *iterator) Current() postings.ID { - return postings.ID(p.current) -} - -func (p *iterator) Err() error { - return nil -} - -func (p *iterator) Close() error { - p.iter = nil - p.hasNext = false - return nil -} diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index bdd694b02a..6c2254a1ad 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -343,7 +343,7 @@ func appendContainerItersWithOp( } func (i *multiBitmapIterator) Next() bool { - if i.err != nil { + if i.err != nil || len(i.iters) == 0 { return false } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go index 2f19c7ef23..970ad84337 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go @@ -249,6 +249,29 @@ func TestMultiBitmap(t *testing.T) { } } +func TestMultiBitmapWithEmptyReadOnlyBitmap(t *testing.T) { + bitmap := roaring.NewBitmap() + bitmap.DirectAdd(1) + bitmap.DirectAdd(3) + bitmap.DirectAdd(5) + + readOnly := newReadOnlyBitmap(t, bitmap) + + emptyReadOnly, err := NewReadOnlyBitmap(nil) + require.NoError(t, err) + + for _, lists := range [][]postings.List{ + []postings.List{readOnly, emptyReadOnly}, + []postings.List{emptyReadOnly, readOnly}, + } { + multi, err := IntersectAndNegateReadOnly(lists, nil) + require.NoError(t, err) + + emptyRegular := NewPostingsList() + require.True(t, postings.Equal(emptyRegular, multi)) + } +} + func bitmapFromPostings(t *testing.T, pl postings.List) *roaring.Bitmap { b, ok := BitmapFromPostingsList(pl) require.True(t, ok) @@ -303,17 +326,7 @@ func genRandBitmapAndReadOnlyBitmap( } list := NewPostingsListFromBitmap(bitmap) - - // Note: do not reuse buffer since read only bitmap - // references them. - buff := bytes.NewBuffer(nil) - _, err := bitmap.WriteTo(buff) - require.NoError(t, err) - - readOnly, err := NewReadOnlyBitmap(buff.Bytes()) - require.NoError(t, err) - - return list, readOnly + return list, newReadOnlyBitmap(t, bitmap) } func postingsString(pl postings.List) string { @@ -340,3 +353,16 @@ func postingsJSON(t *testing.T, pl postings.List) string { require.NoError(t, err) return string(data) } + +func newReadOnlyBitmap(t *testing.T, b *roaring.Bitmap) *ReadOnlyBitmap { + // Note: do not reuse buffer since read only bitmap + // references them. + buff := bytes.NewBuffer(nil) + _, err := b.WriteTo(buff) + require.NoError(t, err) + + readOnly, err := NewReadOnlyBitmap(buff.Bytes()) + require.NoError(t, err) + + return readOnly +} diff --git a/src/m3ninx/postings/roaring/roaring.go b/src/m3ninx/postings/roaring/roaring.go index 714bb62729..b7c0d8d509 100644 --- a/src/m3ninx/postings/roaring/roaring.go +++ b/src/m3ninx/postings/roaring/roaring.go @@ -49,6 +49,16 @@ func Union(inputs []postings.List) (postings.MutableList, error) { return NewPostingsListFromBitmap(unioned), nil } +// UnionInPlace unions in place a postings list with other inputs. 
+func UnionInPlace(first postings.List, inputs []postings.List) error { + b, ok := BitmapFromPostingsList(first) + if !ok { + return errUnionRoaringOnly + } + + return union(b, inputs) +} + func union(unionedBitmap *roaring.Bitmap, inputs []postings.List) error { bitmaps := make([]*roaring.Bitmap, 0, len(inputs)) for _, in := range inputs { diff --git a/src/m3ninx/search/searcher/conjunction.go b/src/m3ninx/search/searcher/conjunction.go index 9719900a4c..42cf8793d2 100644 --- a/src/m3ninx/search/searcher/conjunction.go +++ b/src/m3ninx/search/searcher/conjunction.go @@ -21,6 +21,8 @@ package searcher import ( + "fmt" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -68,7 +70,29 @@ func (s *conjunctionSearcher) Search(r index.Reader) (postings.List, error) { negations = append(negations, pl) } - // Perform a lazy fast intersect and negate. - // TODO: Try and see if returns err, if so fallback to slower method? - return roaring.IntersectAndNegateReadOnly(intersects, negations) + if index.MigrationReadOnlyPostings() { + // Perform a lazy fast intersect and negate. + return roaring.IntersectAndNegateReadOnly(intersects, negations) + } + + // Not running migration path, fallback. + first, ok := intersects[0].(postings.MutableList) + if !ok { + // Note not creating a "errNotMutable" like error since this path + // will be deprecated and we might forget to cleanup the err var. + return nil, fmt.Errorf("postings list for non-migration path not mutable") + } + + result := first.Clone() + for i := 1; i < len(intersects); i++ { + if err := result.Intersect(intersects[i]); err != nil { + return nil, err + } + } + for i := 0; i < len(negations); i++ { + if err := result.Difference(negations[i]); err != nil { + return nil, err + } + } + return result, nil } diff --git a/src/m3ninx/search/searcher/conjunction_test.go b/src/m3ninx/search/searcher/conjunction_test.go index 32132ae8b5..7dbe234b99 100644 --- a/src/m3ninx/search/searcher/conjunction_test.go +++ b/src/m3ninx/search/searcher/conjunction_test.go @@ -33,7 +33,85 @@ import ( "github.com/stretchr/testify/require" ) +// TestConjunctionSearcherMigrationReadOnlyRemove stays until +// migration removed (make sure to use "MigrationReadOnly" as +// part of the string). +// MigrationReadOnly: remove this when done. +func TestConjunctionSearcherMigrationReadOnlyRemove(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + firstReader := index.NewMockReader(mockCtrl) + secondReader := index.NewMockReader(mockCtrl) + + // First searcher. + firstPL1 := roaring.NewPostingsList() + require.NoError(t, firstPL1.Insert(postings.ID(42))) + require.NoError(t, firstPL1.Insert(postings.ID(50))) + firstPL2 := roaring.NewPostingsList() + require.NoError(t, firstPL2.Insert(postings.ID(64))) + firstSearcher := search.NewMockSearcher(mockCtrl) + + // Second searcher. + secondPL1 := roaring.NewPostingsList() + require.NoError(t, secondPL1.Insert(postings.ID(53))) + require.NoError(t, secondPL1.Insert(postings.ID(50))) + secondPL2 := roaring.NewPostingsList() + require.NoError(t, secondPL2.Insert(postings.ID(64))) + require.NoError(t, secondPL2.Insert(postings.ID(72))) + secondSearcher := search.NewMockSearcher(mockCtrl) + + // Third searcher. 
+ thirdPL1 := roaring.NewPostingsList() + require.NoError(t, thirdPL1.Insert(postings.ID(42))) + require.NoError(t, thirdPL1.Insert(postings.ID(53))) + thirdPL2 := roaring.NewPostingsList() + require.NoError(t, thirdPL2.Insert(postings.ID(64))) + require.NoError(t, thirdPL2.Insert(postings.ID(89))) + thirdSearcher := search.NewMockSearcher(mockCtrl) + + gomock.InOrder( + // Get the postings lists for the first Reader. + firstSearcher.EXPECT().Search(firstReader).Return(firstPL1, nil), + secondSearcher.EXPECT().Search(firstReader).Return(secondPL1, nil), + thirdSearcher.EXPECT().Search(firstReader).Return(thirdPL1, nil), + + // Get the postings lists for the second Reader. + firstSearcher.EXPECT().Search(secondReader).Return(firstPL2, nil), + secondSearcher.EXPECT().Search(secondReader).Return(secondPL2, nil), + thirdSearcher.EXPECT().Search(secondReader).Return(thirdPL2, nil), + ) + + var ( + searchers = []search.Searcher{firstSearcher, secondSearcher} + negations = []search.Searcher{thirdSearcher} + ) + + s, err := NewConjunctionSearcher(searchers, negations) + require.NoError(t, err) + + // Test the postings list from the first Reader. + expected := firstPL1.Clone() + expected.Intersect(secondPL1) + expected.Difference(thirdPL1) + pl, err := s.Search(firstReader) + require.NoError(t, err) + require.True(t, pl.Equal(expected)) + + // Test the postings list from the second Reader. + expected = firstPL2.Clone() + expected.Intersect(secondPL2) + expected.Difference(thirdPL2) + pl, err = s.Search(secondReader) + require.NoError(t, err) + require.True(t, pl.Equal(expected)) +} + func TestConjunctionSearcher(t *testing.T) { + // MigrationReadOnly: remove the special casing to turn readonly on. + index.SetMigrationReadOnlyPostings(true) + defer index.SetMigrationReadOnlyPostings(false) + mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() diff --git a/src/m3ninx/search/searcher/disjunction.go b/src/m3ninx/search/searcher/disjunction.go index d539b58dce..8b909d537b 100644 --- a/src/m3ninx/search/searcher/disjunction.go +++ b/src/m3ninx/search/searcher/disjunction.go @@ -21,6 +21,8 @@ package searcher import ( + "fmt" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -44,9 +46,7 @@ func NewDisjunctionSearcher(searchers search.Searchers) (search.Searcher, error) } func (s *disjunctionSearcher) Search(r index.Reader) (postings.List, error) { - var ( - union = make([]postings.List, 0, len(s.searchers)) - ) + union := make([]postings.List, 0, len(s.searchers)) for _, sr := range s.searchers { pl, err := sr.Search(r) if err != nil { @@ -55,8 +55,27 @@ func (s *disjunctionSearcher) Search(r index.Reader) (postings.List, error) { union = append(union, pl) } + if len(union) == 1 { + return union[0], nil + } + + if index.MigrationReadOnlyPostings() { + // Perform a lazy fast union. + return roaring.UnionReadOnly(union) + } + + // Not running migration path, fallback. + first, ok := union[0].(postings.MutableList) + if !ok { + // Note not creating a "errNotMutable" like error since this path + // will be deprecated and we might forget to cleanup the err var. + return nil, fmt.Errorf("postings list for non-migration path not mutable") + } + + result := first.Clone() + if err := roaring.UnionInPlace(result, union[1:]); err != nil { + return nil, err + } - // Perform a lazy fast union. - // TODO: Try and see if returns err, if so fallback to slower method? 
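A minimal usage sketch of the UnionInPlace helper introduced above, as exercised by the disjunction fallback in this hunk; it assumes the m3ninx roaring and postings APIs exactly as they appear in this patch, with error handling elided:

	// Union second into first without allocating a third list.
	first := roaring.NewPostingsList()
	_ = first.Insert(postings.ID(1))
	second := roaring.NewPostingsList()
	_ = second.Insert(postings.ID(2))
	if err := roaring.UnionInPlace(first, []postings.List{second}); err != nil {
		// UnionInPlace fails if first is not backed by a pilosa roaring bitmap.
	}
	// first now contains postings IDs 1 and 2.
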
- return roaring.UnionReadOnly(union) + return result, nil } diff --git a/src/m3ninx/search/searcher/empty.go b/src/m3ninx/search/searcher/empty.go index edcab8f936..2953844141 100644 --- a/src/m3ninx/search/searcher/empty.go +++ b/src/m3ninx/search/searcher/empty.go @@ -36,5 +36,8 @@ func NewEmptySearcher() search.Searcher { } func (s *emptySearcher) Search(r index.Reader) (postings.List, error) { - return roaring.NewReadOnlyBitmap(nil) + if index.MigrationReadOnlyPostings() { + return roaring.NewReadOnlyBitmap(nil) + } + return roaring.NewPostingsList(), nil } diff --git a/src/m3ninx/search/searcher/negation.go b/src/m3ninx/search/searcher/negation.go index 62c5c191c1..2367973127 100644 --- a/src/m3ninx/search/searcher/negation.go +++ b/src/m3ninx/search/searcher/negation.go @@ -21,6 +21,8 @@ package searcher import ( + "fmt" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" @@ -50,9 +52,24 @@ func (s *negationSearcher) Search(r index.Reader) (postings.List, error) { return nil, err } - // Perform a lazy fast intersect and negate. - // TODO: Try and see if returns err, if so fallback to slower method? - intersects := []postings.List{pl} - negations := []postings.List{negatePl} - return roaring.IntersectAndNegateReadOnly(intersects, negations) + if index.MigrationReadOnlyPostings() { + // Perform a lazy fast intersect and negate. + intersects := []postings.List{pl} + negations := []postings.List{negatePl} + return roaring.IntersectAndNegateReadOnly(intersects, negations) + } + + // Not running migration path, fallback. + mutable, ok := pl.(postings.MutableList) + if !ok { + // Note not creating a "errNotMutable" like error since this path + // will be deprecated and we might forget to cleanup the err var. 
+ return nil, fmt.Errorf("postings list for non-migration path not mutable") + } + + result := mutable.Clone() + if err := result.Difference(negatePl); err != nil { + return nil, err + } + return result, nil } From 861fcdf5fd67fd70aba47578a5d24a14785b6ab4 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 3 Nov 2020 00:28:52 -0500 Subject: [PATCH 014/106] Fix integration test kickoff --- .buildkite/pipeline.yml | 4 ++-- Makefile | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index de3fe8b3bf..42d48a6ecf 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -74,7 +74,7 @@ steps: <<: *common - name: "Integration (dbnode cache=LRU postings=default) %n" parallelism: 2 - command: make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru + command: make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru m3db_postings_ro=false plugins: docker-compose#v2.5.1: run: app @@ -82,7 +82,7 @@ steps: <<: *common - name: "Integration (dbnode cache=LRU postings=read_only) %n" parallelism: 2 - command: M3DB_READ_ONLY_POSTINGS=true make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru + command: make clean install-vendor-m3 test-ci-integration-dbnode cache_policy=lru m3db_postings_ro=true plugins: docker-compose#v2.5.1: run: app diff --git a/Makefile b/Makefile index e208a0ab93..f20be26b42 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ thrift_rules_dir := generated/thrift vendor_prefix := vendor cache_policy ?= recently_read genny_target ?= genny-all +m3db_postings_ro ?= false # Remove after MigrationReadOnlyPostings done BUILD := $(abspath ./bin) VENDOR := $(m3_package_path)/$(vendor_prefix) @@ -369,7 +370,7 @@ test-ci-big-unit-$(SUBDIR): .PHONY: test-ci-integration-$(SUBDIR) test-ci-integration-$(SUBDIR): @echo "--- test-ci-integration $(SUBDIR)" - SRC_ROOT=./src/$(SUBDIR) PANIC_ON_INVARIANT_VIOLATED=true INTEGRATION_TIMEOUT=10m TEST_SERIES_CACHE_POLICY=$(cache_policy) make test-base-ci-integration + SRC_ROOT=./src/$(SUBDIR) PANIC_ON_INVARIANT_VIOLATED=true INTEGRATION_TIMEOUT=10m TEST_SERIES_CACHE_POLICY=$(cache_policy) M3DB_READ_ONLY_POSTINGS=$(m3db_postings_ro) make test-base-ci-integration @echo "--- uploading coverage report" $(codecov_push) -f $(coverfile) -F $(SUBDIR) From 6396865ee1e03ca78ab23f56465f5c648dee0788 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 3 Nov 2020 01:13:08 -0500 Subject: [PATCH 015/106] Fix multi bitmap iterator and also implement field filtering for aggregate queries --- src/dbnode/storage/index/block.go | 4 +- .../storage/index/fields_terms_iterator.go | 43 ++++++++-- .../storage/index/filter_fields_iterator.go | 86 +++++++++++-------- .../storage/index/read_through_segment.go | 5 ++ src/m3ninx/index/segment/fst/segment.go | 29 +++++++ src/m3ninx/index/segment/mem/reader.go | 8 +- src/m3ninx/index/segment/types.go | 1 + .../postings/roaring/bitmap_multi_readonly.go | 2 +- 8 files changed, 133 insertions(+), 45 deletions(-) diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index f1f0e80b9f..b35a035cb5 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -615,7 +615,7 @@ func (b *block) aggregateWithSpan( } return aggOpts.FieldFilter.Allow(field) }, - fieldIterFn: func(r segment.Reader) (segment.FieldsIterator, error) { + fieldIterFn: func(r segment.Reader) (segment.FieldsPostingsListIterator, error) { // NB(prateek): we default to using the 
regular (FST) fields iterator
 			// unless we have a predefined list of fields we know we need to restrict
 			// our search to, in which case we iterate that list and check if known values
 			// to this function is expected to have (FieldsFilter) pretty small. If that changes
 			// in the future, we can revisit this.
 			if len(aggOpts.FieldFilter) == 0 {
-				return r.Fields()
+				return r.FieldsPostingsList()
 			}
 			return newFilterFieldsIterator(r, aggOpts.FieldFilter)
 		},
diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go
index 7d07fa4f3b..adc9927f5a 100644
--- a/src/dbnode/storage/index/fields_terms_iterator.go
+++ b/src/dbnode/storage/index/fields_terms_iterator.go
@@ -50,23 +50,23 @@ func (o fieldsAndTermsIteratorOpts) allow(f []byte) bool {
 	return o.allowFn(f)
 }
 
-func (o fieldsAndTermsIteratorOpts) newFieldIter(r segment.Reader) (segment.FieldsIterator, error) {
+func (o fieldsAndTermsIteratorOpts) newFieldIter(r segment.Reader) (segment.FieldsPostingsListIterator, error) {
 	if o.fieldIterFn == nil {
-		return r.Fields()
+		return r.FieldsPostingsList()
 	}
 	return o.fieldIterFn(r)
 }
 
 type allowFn func(field []byte) bool
 
-type newFieldIterFn func(r segment.Reader) (segment.FieldsIterator, error)
+type newFieldIterFn func(r segment.Reader) (segment.FieldsPostingsListIterator, error)
 
 type fieldsAndTermsIter struct {
 	reader segment.Reader
 	opts   fieldsAndTermsIteratorOpts
 
 	err       error
-	fieldIter segment.FieldsIterator
+	fieldIter segment.FieldsPostingsListIterator
 	termIter  segment.TermsIterator
 
 	current struct {
@@ -166,10 +166,43 @@ func (fti *fieldsAndTermsIter) setNextField() bool {
 	}
 
 	for fieldIter.Next() {
-		field := fieldIter.Current()
+		field, curr := fieldIter.Current()
 		if !fti.opts.allow(field) {
 			continue
 		}
+
+		if index.MigrationReadOnlyPostings() && fti.restrictByPostings != nil {
+			// Check that the term is part of at least some of the documents we're
+			// restricted to providing results for, based on intersection
+			// count.
+			restrictBy := fti.restrictByPostings
+			match, err := fti.restrictByPostingsIntersect.Intersects(restrictBy, curr)
+			if err != nil {
+				fti.err = err
+				return false
+			}
+			if !match {
+				// No match.
+				continue
+			}
+		} else if !index.MigrationReadOnlyPostings() && fti.restrictByPostingsBitmap != nil {
+			bitmap, ok := roaring.BitmapFromPostingsList(curr)
+			if !ok {
+				fti.err = errUnpackBitmapFromPostingsList
+				return false
+			}
+
+			// Check that the term is part of at least some of the documents we're
+			// restricted to providing results for, based on intersection
+			// count.
+			// Note: IntersectionCount is significantly faster than intersecting and
+			// counting results and also does not allocate.
+			if n := fti.restrictByPostingsBitmap.IntersectionCount(bitmap); n < 1 {
+				// No match.
+				continue
+			}
+		}
+
+		fti.current.field = field
 		return true
 	}
 	return false
 }
diff --git a/src/dbnode/storage/index/filter_fields_iterator.go b/src/dbnode/storage/index/filter_fields_iterator.go
index 96ac7570e2..0276d026cc 100644
--- a/src/dbnode/storage/index/filter_fields_iterator.go
+++ b/src/dbnode/storage/index/filter_fields_iterator.go
@@ -21,66 +21,82 @@
 package index
 
 import (
+	"bytes"
 	"errors"
+	"sort"
 
 	"github.com/m3db/m3/src/m3ninx/index/segment"
+	"github.com/m3db/m3/src/m3ninx/postings"
 )
 
 var (
 	errNoFiltersSpecified = errors.New("no fields specified to filter upon")
 )
 
+var _ segment.FieldsPostingsListIterator = &filterFieldsIterator{}
+
+type filterFieldsIterator struct {
+	reader segment.Reader
+	sorted [][]byte
+	iter   segment.FieldsPostingsListIterator
+
+	currField         []byte
+	currFieldPostings postings.List
+}
+
 func newFilterFieldsIterator(
 	reader segment.Reader,
 	fields AggregateFieldFilter,
-) (segment.FieldsIterator, error) {
+) (segment.FieldsPostingsListIterator, error) {
 	if len(fields) == 0 {
 		return nil, errNoFiltersSpecified
 	}
+	sorted := make([][]byte, 0, len(fields))
+	for _, field := range fields {
+		sorted = append(sorted, field)
+	}
+	sort.Slice(sorted, func(i, j int) bool {
+		return bytes.Compare(sorted[i], sorted[j]) < 0
+	})
+	iter, err := reader.FieldsPostingsList()
+	if err != nil {
+		return nil, err
+	}
 	return &filterFieldsIterator{
-		reader:     reader,
-		fields:     fields,
-		currentIdx: -1,
+		reader: reader,
+		sorted: sorted,
+		iter:   iter,
 	}, nil
 }
 
-type filterFieldsIterator struct {
-	reader segment.Reader
-	fields AggregateFieldFilter
-
-	err        error
-	currentIdx int
-}
-
-var _ segment.FieldsIterator = &filterFieldsIterator{}
-
 func (f *filterFieldsIterator) Next() bool {
-	if f.err != nil {
-		return false
-	}
-
-	f.currentIdx++ // required because we start at -1
-	for f.currentIdx < len(f.fields) {
-		field := f.fields[f.currentIdx]
-
-		ok, err := f.reader.ContainsField(field)
-		if err != nil {
-			f.err = err
-			return false
+	for f.iter.Next() && len(f.sorted) > 0 {
+		f.currField, f.currFieldPostings = f.iter.Current()
+		cmpResult := bytes.Compare(f.currField, f.sorted[0])
+		if cmpResult < 0 {
+			// This field appears before the next sorted filter entry, skip it.
+			continue
 		}
-
-		// i.e. we found a field from the filter list contained in the segment.
-		if ok {
-			return true
+		if cmpResult > 0 {
+			// This field appears after the last sorted filter entry, no more matches.
+			return false
 		}
 
-		// the current field is unsuitable, so we skip to the next possiblity.
-		f.currentIdx++
+		f.sorted = f.sorted[1:]
+		return true
 	}
 	return false
 }
 
-func (f *filterFieldsIterator) Current() []byte { return f.fields[f.currentIdx] }
-func (f *filterFieldsIterator) Err() error      { return f.err }
-func (f *filterFieldsIterator) Close() error    { return nil }
+func (f *filterFieldsIterator) Current() ([]byte, postings.List) {
+	return f.currField, f.currFieldPostings
+}
+
+func (f *filterFieldsIterator) Err() error {
+	return f.iter.Err()
+}
+
+func (f *filterFieldsIterator) Close() error {
+	return f.iter.Close()
+}
diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go
index 76665cef34..8d93fe1a13 100644
--- a/src/dbnode/storage/index/read_through_segment.go
+++ b/src/dbnode/storage/index/read_through_segment.go
@@ -277,6 +277,11 @@ func (s *readThroughSegmentReader) Fields() (segment.FieldsIterator, error) {
 	return s.reader.Fields()
 }
 
+// FieldsPostingsList is a pass through call.
+func (s *readThroughSegmentReader) FieldsPostingsList() (segment.FieldsPostingsListIterator, error) { + return s.reader.FieldsPostingsList() +} + // ContainsField is a pass through call. func (s *readThroughSegmentReader) ContainsField(field []byte) (bool, error) { return s.reader.ContainsField(field) diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index c5c9049c93..0fb7e5717b 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -360,6 +360,22 @@ func (i *termsIterable) Terms(field []byte) (sgmt.TermsIterator, error) { return i.termsNotClosedMaybeFinalizedWithRLock(field) } +func (i *termsIterable) fieldsNotClosedMaybeFinalizedWithRLock() (sgmt.FieldsPostingsListIterator, error) { + // NB(r): Not closed, but could be finalized (i.e. closed segment reader) + // calling match field after this segment is finalized. + if i.r.finalized { + return nil, errReaderFinalized + } + + i.fieldsIter.reset(fstTermsIterOpts{ + seg: i.r, + fst: i.r.fieldsFST, + finalizeFST: false, + }) + i.postingsIter.reset(i.r, i.fieldsIter) + return i.postingsIter, nil +} + func (i *termsIterable) termsNotClosedMaybeFinalizedWithRLock( field []byte, ) (sgmt.TermsIterator, error) { @@ -862,6 +878,19 @@ func (sr *fsSegmentReader) Fields() (sgmt.FieldsIterator, error) { return iter, nil } +func (sr *fsSegmentReader) FieldsPostingsList() (sgmt.FieldsPostingsListIterator, error) { + if sr.closed { + return nil, errReaderClosed + } + if sr.termsIterable == nil { + sr.termsIterable = newTermsIterable(sr.fsSegment) + } + sr.fsSegment.RLock() + iter, err := sr.termsIterable.fieldsNotClosedMaybeFinalizedWithRLock() + sr.fsSegment.RUnlock() + return iter, err +} + func (sr *fsSegmentReader) ContainsField(field []byte) (bool, error) { if sr.closed { return false, errReaderClosed diff --git a/src/m3ninx/index/segment/mem/reader.go b/src/m3ninx/index/segment/mem/reader.go index c029181efb..e77a72ece6 100644 --- a/src/m3ninx/index/segment/mem/reader.go +++ b/src/m3ninx/index/segment/mem/reader.go @@ -38,7 +38,7 @@ var ( type reader struct { sync.RWMutex - segment ReadableSegment + segment *memSegment limits readerDocRange plPool postings.Pool @@ -50,7 +50,7 @@ type readerDocRange struct { endExclusive postings.ID } -func newReader(s ReadableSegment, l readerDocRange, p postings.Pool) sgmt.Reader { +func newReader(s *memSegment, l readerDocRange, p postings.Pool) sgmt.Reader { return &reader{ segment: s, limits: l, @@ -62,6 +62,10 @@ func (r *reader) Fields() (sgmt.FieldsIterator, error) { return r.segment.Fields() } +func (r *reader) FieldsPostingsList() (sgmt.FieldsPostingsListIterator, error) { + return r.segment.FieldsPostingsList() +} + func (r *reader) ContainsField(field []byte) (bool, error) { return r.segment.ContainsField(field) } diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go index 566a25bd00..94ee167716 100644 --- a/src/m3ninx/index/segment/types.go +++ b/src/m3ninx/index/segment/types.go @@ -69,6 +69,7 @@ type Reader interface { index.Reader FieldsIterable TermsIterable + FieldsPostingsListIterable // ContainsField returns a bool indicating if the Segment contains the provided field. 
ContainsField(field []byte) (bool, error) diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 6c2254a1ad..0aa6cbc93f 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -546,7 +546,7 @@ func newMultiBitmapContainersIterator( opts.intersectNegate, multiContainerOpNegate) return &multiBitmapContainersIterator{ multiBitmapOptions: opts, - initial: iters, + initial: initial, iters: iters, } } From 8d74ee15d553388114836d747bc8ccbe973db906 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 3 Nov 2020 02:59:26 -0500 Subject: [PATCH 016/106] Fix field filtering based on restrict by tags query --- .../prometheus/test.sh | 13 ++- .../fst/fst_terms_postings_iterator.go | 21 +++-- src/m3ninx/index/segment/fst/segment.go | 83 +++++++++++++++---- 3 files changed, 90 insertions(+), 27 deletions(-) diff --git a/scripts/docker-integration-tests/prometheus/test.sh b/scripts/docker-integration-tests/prometheus/test.sh index 51d2621ba2..38d974027f 100755 --- a/scripts/docker-integration-tests/prometheus/test.sh +++ b/scripts/docker-integration-tests/prometheus/test.sh @@ -293,8 +293,9 @@ function test_query_restrict_tags { # First write some hidden metrics. echo "Test write with unaggregated metrics type works as expected" - TAG_NAME_0="restricted_metrics_type" TAG_VALUE_0="hidden" \ - TAG_NAME_1="foo_tag" TAG_VALUE_1="foo_tag_value" \ + TAG_NAME_0="__name__" TAG_VALUE_0="hidden_metric_by_default" \ + TAG_NAME_1="restricted_metrics_type" TAG_VALUE_1="hidden" \ + TAG_NAME_2="foo_tag" TAG_VALUE_2="foo_tag_value" \ prometheus_remote_write \ some_hidden_metric now 42.42 \ true "Expected request to succeed" \ @@ -311,6 +312,14 @@ function test_query_restrict_tags { echo "Test restrict by tags with coordinator defaults" ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ '[[ $(curl -s 0.0.0.0:7201/api/v1/query?query=\\{restricted_metrics_type=\"hidden\"\\} | jq -r ".data.result | length") -eq 0 ]]' + + # Check we can see metric in labels endpoint with zero restrictions applied. + ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ + '[[ $(curl -s -H "M3-Restrict-By-Tags-JSON: {}" 0.0.0.0:7201/api/v1/labels | jq ".data | map(select(. == \"hidden_metric_by_default\")) | length") -eq 1 ]]' + + # Now check that without easing restrictions that we can see the metric in question. + ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ + '[[ $(curl -s 0.0.0.0:7201/api/v1/labels | jq ".data | map(select(. 
== \"hidden_metric_by_default\")) | length") -eq 0 ]]' } function test_series { diff --git a/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go b/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go index c74a7cb15d..34b275299d 100644 --- a/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go +++ b/src/m3ninx/index/segment/fst/fst_terms_postings_iterator.go @@ -39,15 +39,19 @@ var postingsIterRoaringPoolingConfig = pilosaroaring.ContainerPoolingConfigurati MaxKeysAndContainersSliceLength: 128 * 10, } +var _ sgmt.TermsIterator = &fstTermsPostingsIter{} +var _ sgmt.FieldsPostingsListIterator = &fstTermsPostingsIter{} + type fstTermsPostingsIter struct { bitmap *roaring.ReadOnlyBitmap legacyBitmap *pilosaroaring.Bitmap legacyList postings.List - seg *fsSegment - termsIter *fstTermsIter - currTerm []byte - err error + seg *fsSegment + termsIter *fstTermsIter + currTerm []byte + fieldOffsets bool + err error } func newFSTTermsPostingsIter() *fstTermsPostingsIter { @@ -69,8 +73,6 @@ func newFSTTermsPostingsIter() *fstTermsPostingsIter { return i } -var _ sgmt.TermsIterator = &fstTermsPostingsIter{} - func (f *fstTermsPostingsIter) clear() { if index.MigrationReadOnlyPostings() { f.bitmap.Reset(nil) @@ -80,17 +82,20 @@ func (f *fstTermsPostingsIter) clear() { f.seg = nil f.termsIter = nil f.currTerm = nil + f.fieldOffsets = false f.err = nil } func (f *fstTermsPostingsIter) reset( seg *fsSegment, termsIter *fstTermsIter, + fieldOffsets bool, ) { f.clear() f.seg = seg f.termsIter = termsIter + f.fieldOffsets = fieldOffsets } func (f *fstTermsPostingsIter) Next() bool { @@ -109,10 +114,10 @@ func (f *fstTermsPostingsIter) Next() bool { f.seg.RLock() if index.MigrationReadOnlyPostings() { f.err = f.seg.unmarshalReadOnlyBitmapNotClosedMaybeFinalizedWithLock(f.bitmap, - currOffset) + currOffset, f.fieldOffsets) } else { f.err = f.seg.unmarshalBitmapNotClosedMaybeFinalizedWithLock(f.legacyBitmap, - currOffset) + currOffset, f.fieldOffsets) } f.seg.RUnlock() diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index 0fb7e5717b..fe1ec392db 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -372,7 +372,7 @@ func (i *termsIterable) fieldsNotClosedMaybeFinalizedWithRLock() (sgmt.FieldsPos fst: i.r.fieldsFST, finalizeFST: false, }) - i.postingsIter.reset(i.r, i.fieldsIter) + i.postingsIter.reset(i.r, i.fieldsIter, true) return i.postingsIter, nil } @@ -399,31 +399,79 @@ func (i *termsIterable) termsNotClosedMaybeFinalizedWithRLock( fst: termsFST, finalizeFST: true, }) - i.postingsIter.reset(i.r, i.fieldsIter) + i.postingsIter.reset(i.r, i.fieldsIter, false) return i.postingsIter, nil } -func (r *fsSegment) unmarshalReadOnlyBitmapNotClosedMaybeFinalizedWithLock(b *roaring.ReadOnlyBitmap, offset uint64) error { +func (r *fsSegment) unmarshalReadOnlyBitmapNotClosedMaybeFinalizedWithLock( + b *roaring.ReadOnlyBitmap, + offset uint64, + fieldsOffset bool, +) error { if r.finalized { return errReaderFinalized } - postingsBytes, err := r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, offset) - if err != nil { - return fmt.Errorf("unable to retrieve postings data: %v", err) + var postingsBytes []byte + if fieldsOffset { + protoBytes, _, err := r.retrieveTermsBytesWithRLock(r.data.FSTTermsData.Bytes, offset) + if err != nil { + return err + } + + var fieldData fswriter.FieldData + if err := fieldData.Unmarshal(protoBytes); err != nil { + return err + } + + postingsOffset := 
fieldData.FieldPostingsListOffset + postingsBytes, err = r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, postingsOffset) + if err != nil { + return fmt.Errorf("unable to retrieve postings data: %v", err) + } + } else { + var err error + postingsBytes, err = r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, offset) + if err != nil { + return fmt.Errorf("unable to retrieve postings data: %v", err) + } } return b.Reset(postingsBytes) } -func (r *fsSegment) unmarshalBitmapNotClosedMaybeFinalizedWithLock(b *pilosaroaring.Bitmap, offset uint64) error { +func (r *fsSegment) unmarshalBitmapNotClosedMaybeFinalizedWithLock( + b *pilosaroaring.Bitmap, + offset uint64, + fieldsOffset bool, +) error { if r.finalized { return errReaderFinalized } - postingsBytes, err := r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, offset) - if err != nil { - return fmt.Errorf("unable to retrieve postings data: %v", err) + var postingsBytes []byte + if fieldsOffset { + protoBytes, _, err := r.retrieveTermsBytesWithRLock(r.data.FSTTermsData.Bytes, offset) + if err != nil { + return err + } + + var fieldData fswriter.FieldData + if err := fieldData.Unmarshal(protoBytes); err != nil { + return err + } + + postingsOffset := fieldData.FieldPostingsListOffset + postingsBytes, err = r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, postingsOffset) + if err != nil { + return fmt.Errorf("unable to retrieve postings data: %v", err) + } + } else { + var err error + postingsBytes, err = r.retrieveBytesWithRLock(r.data.PostingsData.Bytes, offset) + if err != nil { + return fmt.Errorf("unable to retrieve postings data: %v", err) + } } return b.UnmarshalBinary(postingsBytes) @@ -848,10 +896,11 @@ var _ sgmt.Reader = (*fsSegmentReader)(nil) // fsSegmentReader is not thread safe for use and relies on the underlying // segment for synchronization. type fsSegmentReader struct { - closed bool - ctx context.Context - fsSegment *fsSegment - termsIterable *termsIterable + closed bool + ctx context.Context + fsSegment *fsSegment + fieldsIterable *termsIterable + termsIterable *termsIterable } func newReader( @@ -882,11 +931,11 @@ func (sr *fsSegmentReader) FieldsPostingsList() (sgmt.FieldsPostingsListIterator if sr.closed { return nil, errReaderClosed } - if sr.termsIterable == nil { - sr.termsIterable = newTermsIterable(sr.fsSegment) + if sr.fieldsIterable == nil { + sr.fieldsIterable = newTermsIterable(sr.fsSegment) } sr.fsSegment.RLock() - iter, err := sr.termsIterable.fieldsNotClosedMaybeFinalizedWithRLock() + iter, err := sr.fieldsIterable.fieldsNotClosedMaybeFinalizedWithRLock() sr.fsSegment.RUnlock() return iter, err } From 8da22f2f35f34398fa074a11f71822897c2ede9a Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 3 Nov 2020 03:08:44 -0500 Subject: [PATCH 017/106] Close iterator just once --- src/dbnode/storage/index/block.go | 10 +++++----- src/m3ninx/postings/roaring/bitmap_multi_readonly.go | 6 ------ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index b35a035cb5..62239a47ab 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -681,7 +681,6 @@ func (b *block) aggregateWithSpan( if err != nil { return false, err } - iterClosed = false // only once the iterator has been successfully Reset(). 
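The remainder of this hunk applies a reset-per-reader, close-once shape. A minimal sketch of that pattern, assuming an iterator with Reset/Next/Err/Close methods; iter, readers and opts are illustrative placeholders, not the exact block.go variables:

	iterClosed := false
	defer func() {
		// Free resources if an error path returns early.
		if !iterClosed {
			_ = iter.Close()
		}
	}()
	for _, reader := range readers {
		if err := iter.Reset(reader, opts); err != nil {
			return err
		}
		for iter.Next() {
			// Consume iter.Current() into the current results batch.
		}
		if err := iter.Err(); err != nil {
			return err
		}
	}
	// Close once after all readers are drained, otherwise resources are
	// freed before the next Reset.
	iterClosed = true
	return iter.Close()
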
for iter.Next() { if opts.LimitsExceeded(size, docsCount) { @@ -703,11 +702,12 @@ func (b *block) aggregateWithSpan( if err := iter.Err(); err != nil { return false, err } + } - iterClosed = true - if err := iter.Close(); err != nil { - return false, err - } + // Close iterator just once, otherwise resources free'd before Reset called. + iterClosed = true + if err := iter.Close(); err != nil { + return false, err } // Add last batch to results if remaining. diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 0aa6cbc93f..b8a2c37a00 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -784,12 +784,6 @@ func (b *bitmapContainer) Reset(set bool) { b.bitmap = b.allocated } -func (b *bitmapContainer) readOnlyContainer() bitmapReadOnlyContainer { - return bitmapReadOnlyContainer{ - values: b.bitmap, - } -} - type bitmapContainerIterator struct { containerKey uint64 bitmap *bitmapContainer From 4ea2ec6c183f606f097702c4c0537a518b2e2dce Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 3 Nov 2020 03:22:57 -0500 Subject: [PATCH 018/106] Fix integration test --- .../prometheus/test.sh | 17 +++++++++++++---- .../storage/index/fields_terms_iterator.go | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/docker-integration-tests/prometheus/test.sh b/scripts/docker-integration-tests/prometheus/test.sh index 38d974027f..a19239e9b0 100755 --- a/scripts/docker-integration-tests/prometheus/test.sh +++ b/scripts/docker-integration-tests/prometheus/test.sh @@ -295,7 +295,8 @@ function test_query_restrict_tags { echo "Test write with unaggregated metrics type works as expected" TAG_NAME_0="__name__" TAG_VALUE_0="hidden_metric_by_default" \ TAG_NAME_1="restricted_metrics_type" TAG_VALUE_1="hidden" \ - TAG_NAME_2="foo_tag" TAG_VALUE_2="foo_tag_value" \ + TAG_NAME_2="hidden_label_by_default" TAG_VALUE_2="hidden" \ + TAG_NAME_3="foo_tag" TAG_VALUE_3="foo_tag_value" \ prometheus_remote_write \ some_hidden_metric now 42.42 \ true "Expected request to succeed" \ @@ -313,13 +314,21 @@ function test_query_restrict_tags { ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ '[[ $(curl -s 0.0.0.0:7201/api/v1/query?query=\\{restricted_metrics_type=\"hidden\"\\} | jq -r ".data.result | length") -eq 0 ]]' + # Check we can see metric in label values endpoint with zero restrictions applied. + ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ + '[[ $(curl -s -H "M3-Restrict-By-Tags-JSON: {}" 0.0.0.0:7201/api/v1/label/__name__/values | jq ".data | map(select(. == \"hidden_metric_by_default\")) | length") -eq 1 ]]' + + # Now check that without easing restrictions that we can see the label value in question. + ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ + '[[ $(curl -s 0.0.0.0:7201/api/v1/label/__name__/values | jq ".data | map(select(. == \"hidden_metric_by_default\")) | length") -eq 0 ]]' + # Check we can see metric in labels endpoint with zero restrictions applied. ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ - '[[ $(curl -s -H "M3-Restrict-By-Tags-JSON: {}" 0.0.0.0:7201/api/v1/labels | jq ".data | map(select(. == \"hidden_metric_by_default\")) | length") -eq 1 ]]' + '[[ $(curl -s -H "M3-Restrict-By-Tags-JSON: {}" 0.0.0.0:7201/api/v1/labels | jq ".data | map(select(. 
== \"hidden_label_by_default\")) | length") -eq 1 ]]' - # Now check that without easing restrictions that we can see the metric in question. + # Now check that without easing restrictions that we can see the label in question. ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 retry_with_backoff \ - '[[ $(curl -s 0.0.0.0:7201/api/v1/labels | jq ".data | map(select(. == \"hidden_metric_by_default\")) | length") -eq 0 ]]' + '[[ $(curl -s 0.0.0.0:7201/api/v1/labels | jq ".data | map(select(. == \"hidden_label_by_default\")) | length") -eq 0 ]]' } function test_series { diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index adc9927f5a..e6d38f5a26 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -114,9 +114,21 @@ func (fti *fieldsAndTermsIter) Reset( reader segment.Reader, opts fieldsAndTermsIteratorOpts, ) error { + // Keep restrict by postings intersect check until completely closed. restrictByPostingsIntersect := fti.restrictByPostingsIntersect + + // Close per use items. + if multiErr := fti.closePerUse(); multiErr.FinalError() != nil { + return multiErr.FinalError() + } + + // Zero state. *fti = fieldsAndTermsIterZeroed + + // Restore restrict by postings intersect check. fti.restrictByPostingsIntersect = restrictByPostingsIntersect + + // Set per use fields. fti.reader = reader fti.opts = opts if reader == nil { @@ -329,7 +341,7 @@ func (fti *fieldsAndTermsIter) Err() error { return fti.err } -func (fti *fieldsAndTermsIter) Close() error { +func (fti *fieldsAndTermsIter) closePerUse() xerrors.MultiError { var multiErr xerrors.MultiError if fti.fieldIter != nil { multiErr = multiErr.Add(fti.fieldIter.Close()) @@ -337,6 +349,11 @@ func (fti *fieldsAndTermsIter) Close() error { if fti.termIter != nil { multiErr = multiErr.Add(fti.termIter.Close()) } + return multiErr +} + +func (fti *fieldsAndTermsIter) Close() error { + multiErr := fti.closePerUse() if fti.restrictByPostingsIntersect != nil { multiErr = multiErr.Add(fti.restrictByPostingsIntersect.Close()) } From dbf3148a84d9e34b055a682b3fb067cb8b0b009e Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 3 Nov 2020 20:51:44 -0500 Subject: [PATCH 019/106] Add pooling of regexp searcher and multi bitmap iterators --- src/m3ninx/index/segment/fst/segment.go | 70 ++++++-- .../postings/roaring/bitmap_multi_readonly.go | 166 +++++++++++++----- src/m3ninx/x/safe_closer.go | 13 +- 3 files changed, 189 insertions(+), 60 deletions(-) diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index fe1ec392db..562dae8782 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -576,6 +576,46 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( return pl, nil } +type regexpSearcher struct { + fstCloser x.SafeCloser + iterCloser x.SafeCloser + iterAlloc vellum.FSTIterator + iter *vellum.FSTIterator + pls []postings.List +} + +func newRegexpSearcher() *regexpSearcher { + r := ®expSearcher{ + fstCloser: x.NewSafeCloser(nil), + iterCloser: x.NewSafeCloser(nil), + pls: make([]postings.List, 0, 16), + } + r.iter = &r.iterAlloc + return r +} + +func (s *regexpSearcher) Reset() { + for i := range s.pls { + s.pls[i] = nil + } + s.pls = s.pls[:0] +} + +var regexpSearcherPool = sync.Pool{ + New: func() interface{} { + return newRegexpSearcher() + }, +} + +func getRegexpSearcher() *regexpSearcher { + return 
regexpSearcherPool.Get().(*regexpSearcher) +} + +func putRegexpSearcher(v *regexpSearcher) { + v.Reset() + regexpSearcherPool.Put(v) +} + func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( field []byte, compiled index.CompiledRegex, @@ -607,16 +647,14 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( return r.opts.PostingsListPool().Get(), nil } - var ( - fstCloser = x.NewSafeCloser(termsFST) - iter, iterErr = termsFST.Search(re, compiled.PrefixBegin, compiled.PrefixEnd) - iterCloser = x.NewSafeCloser(iter) - // NB(prateek): way quicker to union the PLs together at the end, rathen than one at a time. - pls []postings.List // TODO: pool this slice allocation - ) + searcher := getRegexpSearcher() + iterErr := searcher.iter.Reset(termsFST, compiled.PrefixBegin, compiled.PrefixEnd, re) + searcher.fstCloser.Reset(termsFST) + searcher.iterCloser.Reset(searcher.iter) defer func() { - iterCloser.Close() - fstCloser.Close() + searcher.fstCloser.Close() + searcher.iterCloser.Close() + putRegexpSearcher(searcher) }() for { @@ -628,31 +666,31 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( return nil, iterErr } - _, postingsOffset := iter.Current() + _, postingsOffset := searcher.iter.Current() nextPl, err := r.retrievePostingsListWithRLock(postingsOffset) if err != nil { return nil, err } - pls = append(pls, nextPl) - iterErr = iter.Next() + searcher.pls = append(searcher.pls, nextPl) + iterErr = searcher.iter.Next() } var pl postings.List if index.MigrationReadOnlyPostings() { // Perform a lazy fast union. - pl, err = roaring.UnionReadOnly(pls) + pl, err = roaring.UnionReadOnly(searcher.pls) } else { - pl, err = roaring.Union(pls) + pl, err = roaring.Union(searcher.pls) } if err != nil { return nil, err } - if err := iterCloser.Close(); err != nil { + if err := searcher.iterCloser.Close(); err != nil { return nil, err } - if err := fstCloser.Close(); err != nil { + if err := searcher.fstCloser.Close(); err != nil { return nil, err } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index b8a2c37a00..399f64284b 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -236,30 +236,65 @@ func (i *multiBitmap) CountSlow() int { } func (i *multiBitmap) Iterator() postings.Iterator { - return newMultiBitmapIterator(i.multiBitmapOptions) + iter := getMultiBitmapIterator() + iter.Reset(i.multiBitmapOptions) + return iter } func (i *multiBitmap) ContainerIterator() containerIterator { - return newMultiBitmapContainersIterator(i.multiBitmapOptions) + iter := getMultiBitmapContainersIterator() + iter.Reset(i.multiBitmapOptions) + return iter } func (i *multiBitmap) Equal(other postings.List) bool { return postings.Equal(i, other) } +var multiBitmapIteratorPool = sync.Pool{ + New: func() interface{} { + return newMultiBitmapIterator(multiBitmapOptions{}) + }, +} + +func getMultiBitmapIterator() *multiBitmapIterator { + return multiBitmapIteratorPool.Get().(*multiBitmapIterator) +} + +func putMultiBitmapIterator(v *multiBitmapIterator) { + v.Reset(multiBitmapOptions{}) + multiBitmapIteratorPool.Put(v) +} + +var multiBitmapContainersIteratorPool = sync.Pool{ + New: func() interface{} { + return newMultiBitmapContainersIterator(multiBitmapOptions{}) + }, +} + +func getMultiBitmapContainersIterator() *multiBitmapContainersIterator { + return multiBitmapContainersIteratorPool.Get().(*multiBitmapContainersIterator) +} + +func 
putMultiBitmapContainersIterator(v *multiBitmapContainersIterator) { + v.Reset(multiBitmapOptions{}) + multiBitmapContainersIteratorPool.Put(v) +} + var _ postings.Iterator = (*multiBitmapIterator)(nil) type multiBitmapIterator struct { multiBitmapOptions - err error + bitmap *bitmapContainer + tempBitmap *bitmapContainer + bitmapIter bitmapContainerIterator + initial []containerIteratorAndOp iters []containerIteratorAndOp filtered []containerIteratorAndOp + err error multiContainerIter multiBitmapContainerIterator - bitmap *bitmapContainer - bitmapIter bitmapContainerIterator - tempBitmap *bitmapContainer } type containerIteratorAndOp struct { @@ -295,15 +330,31 @@ func newMultiBitmapIterator( func (i *multiBitmapIterator) Reset(opts multiBitmapOptions) { i.multiBitmapOptions = opts + n := len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) + if i.initial == nil { + i.initial = make([]containerIteratorAndOp, 0, n) + } + if i.iters == nil { i.iters = make([]containerIteratorAndOp, 0, n) } - if i.initial == nil { - i.initial = make([]containerIteratorAndOp, 0, n) + + for j := range i.initial { + i.initial[j] = containerIteratorAndOp{} } - i.iters = i.iters[:0] i.initial = i.initial[:0] + + for j := range i.iters { + i.iters[j] = containerIteratorAndOp{} + } + i.iters = i.iters[:0] + + for j := range i.filtered { + i.filtered[j] = containerIteratorAndOp{} + } + i.filtered = i.filtered[:0] + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, opts.union, multiContainerOpUnion) i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, @@ -452,13 +503,12 @@ func (i *multiBitmapIterator) Close() error { iter.it.Close() } - // Return bitmaps to pool. - putBitmapContainer(i.bitmap) - i.bitmap = nil - putBitmapContainer(i.tempBitmap) - i.tempBitmap = nil - // No longer reference the bitmap from iterator. - i.bitmapIter.Reset(0, nil) + // No longer reference anything any longer. + i.Reset(multiBitmapOptions{}) + + // Return this ref to the pool for re-use. 
+ putMultiBitmapIterator(i) + return nil } @@ -522,33 +572,60 @@ var _ containerIterator = (*multiBitmapContainersIterator)(nil) type multiBitmapContainersIterator struct { multiBitmapOptions - err error + tempBitmap *bitmapContainer + initial []containerIteratorAndOp iters []containerIteratorAndOp filtered []containerIteratorAndOp + err error multiContainerIter multiBitmapContainerIterator - first bool } func newMultiBitmapContainersIterator( opts multiBitmapOptions, ) *multiBitmapContainersIterator { - var ( - n = len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) - iters = make([]containerIteratorAndOp, 0, n) - initial = make([]containerIteratorAndOp, 0, n) - ) - initial, iters = appendContainerItersWithOp(initial, iters, + i := &multiBitmapContainersIterator{ + tempBitmap: getBitmapContainer(), + } + i.Reset(opts) + return i +} + +func (i *multiBitmapContainersIterator) Reset(opts multiBitmapOptions) { + i.multiBitmapOptions = opts + + n := len(opts.union) + len(opts.intersect) + len(opts.intersectNegate) + if i.initial == nil { + i.initial = make([]containerIteratorAndOp, 0, n) + } + + if i.iters == nil { + i.iters = make([]containerIteratorAndOp, 0, n) + } + + for j := range i.initial { + i.initial[j] = containerIteratorAndOp{} + } + i.initial = i.initial[:0] + + for j := range i.iters { + i.iters[j] = containerIteratorAndOp{} + } + i.iters = i.iters[:0] + + for j := range i.filtered { + i.filtered[j] = containerIteratorAndOp{} + } + i.filtered = i.filtered[:0] + + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, opts.union, multiContainerOpUnion) - initial, iters = appendContainerItersWithOp(initial, iters, + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, opts.intersect, multiContainerOpIntersect) - initial, iters = appendContainerItersWithOp(initial, iters, + i.initial, i.iters = appendContainerItersWithOp(i.initial, i.iters, opts.intersectNegate, multiContainerOpNegate) - return &multiBitmapContainersIterator{ - multiBitmapOptions: opts, - initial: initial, - iters: iters, - } + i.err = nil + i.multiContainerIter = multiBitmapContainerIterator{} } func (i *multiBitmapContainersIterator) NextContainer() bool { @@ -614,8 +691,6 @@ func (i *multiBitmapContainersIterator) ContainerUnion( // may use it when we call iter.it.ContainerFoo(...) so // we use a specific intermediary here. tempBitmap := i.getTempIntersectAndNegate(ctx) - defer putBitmapContainer(tempBitmap) - unionBitmapInPlace(target.bitmap, tempBitmap.bitmap) } } @@ -631,8 +706,6 @@ func (i *multiBitmapContainersIterator) ContainerIntersect( // may use it when we call iter.it.ContainerFoo(...) so // we use a specific intermediary here. tempBitmap := i.getTempUnion(ctx) - defer putBitmapContainer(tempBitmap) - intersectBitmapInPlace(target.bitmap, tempBitmap.bitmap) case multiBitmapOpIntersect: // Need to build intermediate and intersect with target. @@ -640,8 +713,6 @@ func (i *multiBitmapContainersIterator) ContainerIntersect( // may use it when we call iter.it.ContainerFoo(...) so // we use a specific intermediary here. tempBitmap := i.getTempIntersectAndNegate(ctx) - defer putBitmapContainer(tempBitmap) - intersectBitmapInPlace(target.bitmap, tempBitmap.bitmap) } } @@ -657,8 +728,6 @@ func (i *multiBitmapContainersIterator) ContainerNegate( // may use it when we call iter.it.ContainerFoo(...) so // we use a specific intermediary here. 
tempBitmap := i.getTempUnion(ctx) - defer putBitmapContainer(tempBitmap) - differenceBitmapInPlace(target.bitmap, tempBitmap.bitmap) case multiBitmapOpIntersect: // Need to build intermediate and intersect with target. @@ -666,8 +735,6 @@ func (i *multiBitmapContainersIterator) ContainerNegate( // may use it when we call iter.it.ContainerFoo(...) so // we use a specific intermediary here. tempBitmap := i.getTempIntersectAndNegate(ctx) - defer putBitmapContainer(tempBitmap) - differenceBitmapInPlace(target.bitmap, tempBitmap.bitmap) } } @@ -677,12 +744,23 @@ func (i *multiBitmapContainersIterator) Err() error { } func (i *multiBitmapContainersIterator) Close() { + // Close any iters that are left if we abort early. + for _, iter := range i.iters { + iter.it.Close() + } + + // Release all refs. + i.Reset(multiBitmapOptions{}) + + // Return to pool. + putMultiBitmapContainersIterator(i) } func (i *multiBitmapContainersIterator) getTempUnion( ctx containerOpContext, ) *bitmapContainer { - tempBitmap := getBitmapContainer() + tempBitmap := i.tempBitmap + tempBitmap.Reset(false) union := i.filter(i.multiContainerIter.containerIters, multiContainerOpUnion) for _, iter := range union { iter.it.ContainerUnion(ctx, tempBitmap) @@ -693,7 +771,7 @@ func (i *multiBitmapContainersIterator) getTempUnion( func (i *multiBitmapContainersIterator) getTempIntersectAndNegate( ctx containerOpContext, ) *bitmapContainer { - tempBitmap := getBitmapContainer() + tempBitmap := i.tempBitmap totalIntersect := len(i.filter(i.initial, multiContainerOpIntersect)) intersect := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) @@ -703,11 +781,13 @@ func (i *multiBitmapContainersIterator) getTempIntersectAndNegate( // there is zero overlap and so intersecting always results in // no results for this container. if totalIntersect != currIntersect { + tempBitmap.Reset(false) return tempBitmap } if currIntersect == 0 { // No intersections so only possible negations of nothing. + tempBitmap.Reset(false) return tempBitmap } diff --git a/src/m3ninx/x/safe_closer.go b/src/m3ninx/x/safe_closer.go index 69de7cd96b..2af7cc9db8 100644 --- a/src/m3ninx/x/safe_closer.go +++ b/src/m3ninx/x/safe_closer.go @@ -26,10 +26,16 @@ import ( xerrors "github.com/m3db/m3/src/x/errors" ) +// SafeCloser is a reuesable safe closer. +type SafeCloser interface { + io.Closer + Reset(closer io.Closer) +} + // NewSafeCloser returns a io.Closer which ensures the // underlying Close() is only called once. It's // useful for cleanup of resources in functions. -func NewSafeCloser(x io.Closer) io.Closer { +func NewSafeCloser(x io.Closer) SafeCloser { return &safeCloser{Closer: x} } @@ -40,6 +46,11 @@ type safeCloser struct { closed bool } +func (c *safeCloser) Reset(closer io.Closer) { + c.Closer = closer + c.closed = false +} + // Close guarantees the underlying Closable's Close() is // only executed the first time it's called. 
func (c *safeCloser) Close() error { From 339b4e7db07d0f1853e3c0d239feb4f0c31f220b Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 4 Nov 2020 01:58:42 -0500 Subject: [PATCH 020/106] Cache loaded vellum FST structs and reuse readers --- src/m3ninx/generated-source-files.mk | 18 ++- src/m3ninx/index/segment/fst/segment.go | 143 +++++++++++++----- .../postings/roaring/bitmap_multi_readonly.go | 23 ++- 3 files changed, 142 insertions(+), 42 deletions(-) diff --git a/src/m3ninx/generated-source-files.mk b/src/m3ninx/generated-source-files.mk index 980fa83c3b..196e5ea0a4 100644 --- a/src/m3ninx/generated-source-files.mk +++ b/src/m3ninx/generated-source-files.mk @@ -13,7 +13,8 @@ genny-all: genny-map-all genny-arraypool-all # Map generation rule for all generated maps .PHONY: genny-map-all -genny-map-all: \ +genny-map-all: \ + genny-map-segment-fst \ genny-map-segment-builder-postingsmap \ genny-map-segment-builder-fieldsmap \ genny-map-segment-builder-idsmap \ @@ -31,6 +32,21 @@ genny-map-all: \ # # [1]: https://github.com/cheekybits/genny +# Map generation rule for index/segment/fst.fstMap +.PHONY: genny-map-segment-fst +genny-map-segment-fst: + cd $(m3x_package_path) && make byteshashmap-gen \ + pkg=fst \ + value_type=vellumFST \ + target_package=$(m3ninx_package)/index/segment/fst \ + rename_nogen_key=true \ + rename_type_prefix=fst \ + rename_constructor=newFSTMap \ + rename_constructor_options=fstMapOptions + # Rename generated map file + mv -f $(m3ninx_package_path)/index/segment/fst/map_gen.go $(m3ninx_package_path)/index/segment/fst/fst_map_gen.go + mv -f $(m3ninx_package_path)/index/segment/fst/new_map_gen.go $(m3ninx_package_path)/index/segment/fst/fst_map_new.go + # Map generation rule for index/segment/builder.PostingsMap .PHONY: genny-map-segment-builder-postingsmap genny-map-segment-builder-postingsmap: diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index 562dae8782..fb920d65bc 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -149,9 +149,11 @@ func NewSegment(data SegmentData, opts Options) (Segment, error) { docsIndexReader: docsIndexReader, docsThirdPartyReader: docsThirdPartyReader, - data: data, - opts: opts, - numDocs: metadata.NumDocs, + data: data, + opts: opts, + + termFSTs: vellumFSTs{fstMap: newFSTMap(fstMapOptions{})}, + numDocs: metadata.NumDocs, } // NB(r): The segment uses the context finalization to finalize @@ -179,7 +181,64 @@ type fsSegment struct { data SegmentData opts Options - numDocs int64 + termFSTs vellumFSTs + numDocs int64 +} + +type vellumFSTs struct { + sync.RWMutex + fstMap *fstMap + readerPool *fstReaderPool +} + +type vellumFST struct { + fst *vellum.FST + readerPool *fstReaderPool +} + +func newVellumFST(fst *vellum.FST) vellumFST { + return vellumFST{ + fst: fst, + readerPool: newFSTReaderPool(fst), + } +} + +func (f vellumFST) Get(key []byte) (uint64, bool, error) { + reader, err := f.readerPool.Get() + if err != nil { + return 0, false, err + } + result, exists, err := reader.Get(key) + // Always return reader to pool. 
+ f.readerPool.Put(reader) + return result, exists, err +} + +type fstReaderPool struct { + pool sync.Pool +} + +func newFSTReaderPool(fst *vellum.FST) *fstReaderPool { + return &fstReaderPool{ + pool: sync.Pool{ + New: func() interface{} { + r, _ := fst.Reader() + return r + }, + }, + } +} + +func (p *fstReaderPool) Get() (*vellum.Reader, error) { + v := p.pool.Get().(*vellum.Reader) + if v == nil { + return nil, fmt.Errorf("vellum reader failed to initialize") + } + return v, nil +} + +func (p *fstReaderPool) Put(v *vellum.Reader) { + p.pool.Put(v) } func (r *fsSegment) SegmentData(ctx context.Context) (SegmentData, error) { @@ -221,12 +280,8 @@ func (r *fsSegment) ContainsID(docID []byte) (bool, error) { } _, exists, err = termsFST.Get(docID) - closeErr := termsFST.Close() - if err != nil { - return false, err - } - return exists, closeErr + return exists, err } func (r *fsSegment) ContainsField(field []byte) (bool, error) { @@ -270,11 +325,26 @@ func (r *fsSegment) Close() error { func (r *fsSegment) Finalize() { r.Lock() + if r.finalized { + r.Unlock() + return + } + + r.finalized = true + + r.termFSTs.Lock() + for _, elem := range r.termFSTs.fstMap.Iter() { + vellumFST := elem.Value() + vellumFST.fst.Close() + } + r.termFSTs.Unlock() + r.fieldsFST.Close() + if r.data.Closer != nil { r.data.Closer.Close() } - r.finalized = true + r.Unlock() } @@ -396,8 +466,8 @@ func (i *termsIterable) termsNotClosedMaybeFinalizedWithRLock( i.fieldsIter.reset(fstTermsIterOpts{ seg: i.r, - fst: termsFST, - finalizeFST: true, + fst: termsFST.fst, + finalizeFST: false, }) i.postingsIter.reset(i.r, i.fieldsIter, false) return i.postingsIter, nil @@ -545,9 +615,6 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( return r.opts.PostingsListPool().Get(), nil } - fstCloser := x.NewSafeCloser(termsFST) - defer fstCloser.Close() - postingsOffset, exists, err := termsFST.Get(term) if err != nil { return nil, err @@ -569,15 +636,10 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( return nil, err } - if err := fstCloser.Close(); err != nil { - return nil, err - } - return pl, nil } type regexpSearcher struct { - fstCloser x.SafeCloser iterCloser x.SafeCloser iterAlloc vellum.FSTIterator iter *vellum.FSTIterator @@ -586,7 +648,6 @@ type regexpSearcher struct { func newRegexpSearcher() *regexpSearcher { r := ®expSearcher{ - fstCloser: x.NewSafeCloser(nil), iterCloser: x.NewSafeCloser(nil), pls: make([]postings.List, 0, 16), } @@ -648,11 +709,9 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( } searcher := getRegexpSearcher() - iterErr := searcher.iter.Reset(termsFST, compiled.PrefixBegin, compiled.PrefixEnd, re) - searcher.fstCloser.Reset(termsFST) + iterErr := searcher.iter.Reset(termsFST.fst, compiled.PrefixBegin, compiled.PrefixEnd, re) searcher.iterCloser.Reset(searcher.iter) defer func() { - searcher.fstCloser.Close() searcher.iterCloser.Close() putRegexpSearcher(searcher) }() @@ -690,10 +749,6 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( return nil, err } - if err := searcher.fstCloser.Close(); err != nil { - return nil, err - } - return pl, nil } @@ -780,27 +835,47 @@ func (r *fsSegment) retrievePostingsListWithRLock(postingsOffset uint64) (postin return pilosa.Unmarshal(postingsBytes) } -func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (*vellum.FST, bool, error) { +func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (vellumFST, bool, error) { + r.termFSTs.RLock() + fst, ok := r.termFSTs.fstMap.Get(field) + 
r.termFSTs.RUnlock()
+	if ok {
+		return fst, true, nil
+	}
+
+	r.termFSTs.Lock()
+	defer r.termFSTs.Unlock()
+
+	fst, ok = r.termFSTs.fstMap.Get(field)
+	if ok {
+		return fst, true, nil
+	}
+
 	termsFSTOffset, exists, err := r.fieldsFST.Get(field)
 	if err != nil {
-		return nil, false, err
+		return vellumFST{}, false, err
 	}
 
 	if !exists {
-		return nil, false, nil
+		return vellumFST{}, false, nil
 	}
 
 	termsFSTBytes, err := r.retrieveBytesWithRLock(r.data.FSTTermsData.Bytes, termsFSTOffset)
 	if err != nil {
-		return nil, false, fmt.Errorf("error while decoding terms fst: %v", err)
+		return vellumFST{}, false, fmt.Errorf("error while decoding terms fst: %v", err)
 	}
 
 	termsFST, err := vellum.Load(termsFSTBytes)
 	if err != nil {
-		return nil, false, fmt.Errorf("error while loading terms fst: %v", err)
+		return vellumFST{}, false, fmt.Errorf("error while loading terms fst: %v", err)
 	}
 
-	return termsFST, true, nil
+	// Save FST to FST map.
+	vellumFST := newVellumFST(termsFST)
+	r.termFSTs.fstMap.Set(field, vellumFST)
+
+	// Return result.
+	return vellumFST, true, nil
 }
 
 // retrieveTermsBytesWithRLock assumes the base []byte slice is a collection of
diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
index 399f64284b..2f60fedae3 100644
--- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
+++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
@@ -572,8 +572,6 @@ var _ containerIterator = (*multiBitmapContainersIterator)(nil)
 type multiBitmapContainersIterator struct {
 	multiBitmapOptions
 
-	tempBitmap *bitmapContainer
-
 	initial  []containerIteratorAndOp
 	iters    []containerIteratorAndOp
 	filtered []containerIteratorAndOp
@@ -584,9 +582,7 @@ func newMultiBitmapContainersIterator(
 	opts multiBitmapOptions,
 ) *multiBitmapContainersIterator {
-	i := &multiBitmapContainersIterator{
-		tempBitmap: getBitmapContainer(),
-	}
+	i := &multiBitmapContainersIterator{}
 	i.Reset(opts)
 	return i
 }
@@ -691,6 +687,8 @@ func (i *multiBitmapContainersIterator) ContainerUnion(
 		// may use it when we call iter.it.ContainerFoo(...) so
 		// we use a specific intermediary here.
 		tempBitmap := i.getTempIntersectAndNegate(ctx)
+		defer putBitmapContainer(tempBitmap)
+
 		unionBitmapInPlace(target.bitmap, tempBitmap.bitmap)
 	}
 }
@@ -706,6 +704,8 @@ func (i *multiBitmapContainersIterator) ContainerIntersect(
 		// may use it when we call iter.it.ContainerFoo(...) so
 		// we use a specific intermediary here.
 		tempBitmap := i.getTempUnion(ctx)
+		defer putBitmapContainer(tempBitmap)
+
 		intersectBitmapInPlace(target.bitmap, tempBitmap.bitmap)
 	case multiBitmapOpIntersect:
 		// Need to build intermediate and intersect with target.
 		// may use it when we call iter.it.ContainerFoo(...) so
 		// we use a specific intermediary here.
 		tempBitmap := i.getTempIntersectAndNegate(ctx)
+		defer putBitmapContainer(tempBitmap)
+
 		intersectBitmapInPlace(target.bitmap, tempBitmap.bitmap)
 	}
 }
@@ -728,6 +730,8 @@ func (i *multiBitmapContainersIterator) ContainerNegate(
 		// may use it when we call iter.it.ContainerFoo(...) so
 		// we use a specific intermediary here.
 		tempBitmap := i.getTempUnion(ctx)
+		defer putBitmapContainer(tempBitmap)
+
 		differenceBitmapInPlace(target.bitmap, tempBitmap.bitmap)
 	case multiBitmapOpIntersect:
 		// Need to build intermediate and intersect with target.
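The hunks above and below restore per-call temporary containers that are handed back with defer. A minimal sketch of that get/defer-put pooling shape, using sync.Pool with an illustrative stand-in type (the names here are placeholders, not this package's API):

	var containerPool = sync.Pool{
		New: func() interface{} { return new(bytes.Buffer) },
	}

	func withTempContainer(fn func(tmp *bytes.Buffer)) {
		tmp := containerPool.Get().(*bytes.Buffer)
		tmp.Reset() // always clear pooled state before reuse
		defer containerPool.Put(tmp) // returned to the pool even on panic
		fn(tmp)
	}
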
@@ -735,6 +739,8 @@ func (i *multiBitmapContainersIterator) ContainerNegate( // may use it when we call iter.it.ContainerFoo(...) so // we use a specific intermediary here. tempBitmap := i.getTempIntersectAndNegate(ctx) + defer putBitmapContainer(tempBitmap) + differenceBitmapInPlace(target.bitmap, tempBitmap.bitmap) } } @@ -759,19 +765,22 @@ func (i *multiBitmapContainersIterator) Close() { func (i *multiBitmapContainersIterator) getTempUnion( ctx containerOpContext, ) *bitmapContainer { - tempBitmap := i.tempBitmap + tempBitmap := getBitmapContainer() + tempBitmap.Reset(false) + union := i.filter(i.multiContainerIter.containerIters, multiContainerOpUnion) for _, iter := range union { iter.it.ContainerUnion(ctx, tempBitmap) } + return tempBitmap } func (i *multiBitmapContainersIterator) getTempIntersectAndNegate( ctx containerOpContext, ) *bitmapContainer { - tempBitmap := i.tempBitmap + tempBitmap := getBitmapContainer() totalIntersect := len(i.filter(i.initial, multiContainerOpIntersect)) intersect := i.filter(i.multiContainerIter.containerIters, multiContainerOpIntersect) From 91afefa3aa66d0d86d20580112f97506e3f7382d Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 4 Nov 2020 02:13:46 -0500 Subject: [PATCH 021/106] Add generated map types --- src/m3ninx/index/segment/fst/fst_map_gen.go | 295 ++++++++++++++++++++ src/m3ninx/index/segment/fst/fst_map_new.go | 91 ++++++ 2 files changed, 386 insertions(+) create mode 100644 src/m3ninx/index/segment/fst/fst_map_gen.go create mode 100644 src/m3ninx/index/segment/fst/fst_map_new.go diff --git a/src/m3ninx/index/segment/fst/fst_map_gen.go b/src/m3ninx/index/segment/fst/fst_map_gen.go new file mode 100644 index 0000000000..63ec00a568 --- /dev/null +++ b/src/m3ninx/index/segment/fst/fst_map_gen.go @@ -0,0 +1,295 @@ +// Copyright (c) 2020 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// This file was automatically generated by genny. +// Any changes will be lost if this file is regenerated. +// see https://github.com/mauricelam/genny + +package fst + +// Copyright (c) 2019 Uber Technologies, Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// This file was automatically generated by genny. +// Any changes will be lost if this file is regenerated. +// see https://github.com/mauricelam/genny + +// Copyright (c) 2018 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// fstMapHash is the hash for a given map entry, this is public to support +// iterating over the map using a native Go for loop. +type fstMapHash uint64 + +// fstMapHashFn is the hash function to execute when hashing a key. +type fstMapHashFn func([]byte) fstMapHash + +// fstMapEqualsFn is the equals key function to execute when detecting equality of a key. +type fstMapEqualsFn func([]byte, []byte) bool + +// fstMapCopyFn is the copy key function to execute when copying the key. +type fstMapCopyFn func([]byte) []byte + +// fstMapFinalizeFn is the finalize key function to execute when finished with a key. 
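+// For this map the finalize function is optional; when set it is typically
+// used to return pooled key bytes to their pool (see newFSTMap).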
+type fstMapFinalizeFn func([]byte) + +// fstMap uses the genny package to provide a generic hash map that can be specialized +// by running the following command from this root of the repository: +// ``` +// make hashmap-gen pkg=outpkg key_type=Type value_type=Type out_dir=/tmp +// ``` +// Or if you would like to use bytes or ident.ID as keys you can use the +// partially specialized maps to generate your own maps as well: +// ``` +// make byteshashmap-gen pkg=outpkg value_type=Type out_dir=/tmp +// make idhashmap-gen pkg=outpkg value_type=Type out_dir=/tmp +// ``` +// This will output to stdout the generated source file to use for your map. +// It uses linear probing by incrementing the number of the hash created when +// hashing the identifier if there is a collision. +// fstMap is a value type and not an interface to allow for less painful +// upgrades when adding/removing methods, it is not likely to need mocking so +// an interface would not be super useful either. +type fstMap struct { + _fstMapOptions + + // lookup uses hash of the identifier for the key and the MapEntry value + // wraps the value type and the key (used to ensure lookup is correct + // when dealing with collisions), we use uint64 for the hash partially + // because lookups of maps with uint64 keys has a fast path for Go. + lookup map[fstMapHash]fstMapEntry +} + +// _fstMapOptions is a set of options used when creating an identifier map, it is kept +// private so that implementers of the generated map can specify their own options +// that partially fulfill these options. +type _fstMapOptions struct { + // hash is the hash function to execute when hashing a key. + hash fstMapHashFn + // equals is the equals key function to execute when detecting equality. + equals fstMapEqualsFn + // copy is the copy key function to execute when copying the key. + copy fstMapCopyFn + // finalize is the finalize key function to execute when finished with a + // key, this is optional to specify. + finalize fstMapFinalizeFn + // initialSize is the initial size for the map, use zero to use Go's std map + // initial size and consequently is optional to specify. + initialSize int +} + +// fstMapEntry is an entry in the map, this is public to support iterating +// over the map using a native Go for loop. +type fstMapEntry struct { + // key is used to check equality on lookups to resolve collisions + key _fstMapKey + // value type stored + value vellumFST +} + +type _fstMapKey struct { + key []byte + finalize bool +} + +// Key returns the map entry key. +func (e fstMapEntry) Key() []byte { + return e.key.key +} + +// Value returns the map entry value. +func (e fstMapEntry) Value() vellumFST { + return e.value +} + +// _fstMapAlloc is a non-exported function so that when generating the source code +// for the map you can supply a public constructor that sets the correct +// hash, equals, copy, finalize options without users of the map needing to +// implement them themselves. +func _fstMapAlloc(opts _fstMapOptions) *fstMap { + m := &fstMap{_fstMapOptions: opts} + m.Reallocate() + return m +} + +func (m *fstMap) newMapKey(k []byte, opts _fstMapKeyOptions) _fstMapKey { + key := _fstMapKey{key: k, finalize: opts.finalizeKey} + if !opts.copyKey { + return key + } + + key.key = m.copy(k) + return key +} + +func (m *fstMap) removeMapKey(hash fstMapHash, key _fstMapKey) { + delete(m.lookup, hash) + if key.finalize { + m.finalize(key.key) + } +} + +// Get returns a value in the map for an identifier if found. 
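+// Lookups resolve hash collisions by linear probing: the uint64 hash is
+// incremented and looked up again until either the stored key matches or
+// an empty slot is found, mirroring the probing in Set and Delete below.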
+func (m *fstMap) Get(k []byte) (vellumFST, bool) { + hash := m.hash(k) + for entry, ok := m.lookup[hash]; ok; entry, ok = m.lookup[hash] { + if m.equals(entry.key.key, k) { + return entry.value, true + } + // Linear probe to "next" to this entry (really a rehash) + hash++ + } + var empty vellumFST + return empty, false +} + +// Set will set the value for an identifier. +func (m *fstMap) Set(k []byte, v vellumFST) { + m.set(k, v, _fstMapKeyOptions{ + copyKey: true, + finalizeKey: m.finalize != nil, + }) +} + +// fstMapSetUnsafeOptions is a set of options to use when setting a value with +// the SetUnsafe method. +type fstMapSetUnsafeOptions struct { + NoCopyKey bool + NoFinalizeKey bool +} + +// SetUnsafe will set the value for an identifier with unsafe options for how +// the map treats the key. +func (m *fstMap) SetUnsafe(k []byte, v vellumFST, opts fstMapSetUnsafeOptions) { + m.set(k, v, _fstMapKeyOptions{ + copyKey: !opts.NoCopyKey, + finalizeKey: !opts.NoFinalizeKey, + }) +} + +type _fstMapKeyOptions struct { + copyKey bool + finalizeKey bool +} + +func (m *fstMap) set(k []byte, v vellumFST, opts _fstMapKeyOptions) { + hash := m.hash(k) + for entry, ok := m.lookup[hash]; ok; entry, ok = m.lookup[hash] { + if m.equals(entry.key.key, k) { + m.lookup[hash] = fstMapEntry{ + key: entry.key, + value: v, + } + return + } + // Linear probe to "next" to this entry (really a rehash) + hash++ + } + + m.lookup[hash] = fstMapEntry{ + key: m.newMapKey(k, opts), + value: v, + } +} + +// Iter provides the underlying map to allow for using a native Go for loop +// to iterate the map, however callers should only ever read and not write +// the map. +func (m *fstMap) Iter() map[fstMapHash]fstMapEntry { + return m.lookup +} + +// Len returns the number of map entries in the map. +func (m *fstMap) Len() int { + return len(m.lookup) +} + +// Contains returns true if value exists for key, false otherwise, it is +// shorthand for a call to Get that doesn't return the value. +func (m *fstMap) Contains(k []byte) bool { + _, ok := m.Get(k) + return ok +} + +// Delete will remove a value set in the map for the specified key. +func (m *fstMap) Delete(k []byte) { + hash := m.hash(k) + for entry, ok := m.lookup[hash]; ok; entry, ok = m.lookup[hash] { + if m.equals(entry.key.key, k) { + m.removeMapKey(hash, entry.key) + return + } + // Linear probe to "next" to this entry (really a rehash) + hash++ + } +} + +// Reset will reset the map by simply deleting all keys to avoid +// allocating a new map. +func (m *fstMap) Reset() { + for hash, entry := range m.lookup { + m.removeMapKey(hash, entry.key) + } +} + +// Reallocate will avoid deleting all keys and reallocate a new +// map, this is useful if you believe you have a large map and +// will not need to grow back to a similar size. +func (m *fstMap) Reallocate() { + if m.initialSize > 0 { + m.lookup = make(map[fstMapHash]fstMapEntry, m.initialSize) + } else { + m.lookup = make(map[fstMapHash]fstMapEntry) + } +} diff --git a/src/m3ninx/index/segment/fst/fst_map_new.go b/src/m3ninx/index/segment/fst/fst_map_new.go new file mode 100644 index 0000000000..87542e7b94 --- /dev/null +++ b/src/m3ninx/index/segment/fst/fst_map_new.go @@ -0,0 +1,91 @@ +// Copyright (c) 2020 Uber Technologies, Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// This file was automatically generated by genny. +// Any changes will be lost if this file is regenerated. +// see https://github.com/mauricelam/genny + +package fst + +import ( + "bytes" + + "github.com/m3db/m3/src/x/pool" + + "github.com/cespare/xxhash/v2" +) + +// Copyright (c) 2018 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// fstMapOptions provides options used when created the map. +type fstMapOptions struct { + InitialSize int + KeyCopyPool pool.BytesPool +} + +// newFSTMap returns a new byte keyed map. +func newFSTMap(opts fstMapOptions) *fstMap { + var ( + copyFn fstMapCopyFn + finalizeFn fstMapFinalizeFn + ) + if pool := opts.KeyCopyPool; pool == nil { + copyFn = func(k []byte) []byte { + return append([]byte(nil), k...) 
+ } + } else { + copyFn = func(k []byte) []byte { + keyLen := len(k) + pooled := pool.Get(keyLen)[:keyLen] + copy(pooled, k) + return pooled + } + finalizeFn = func(k []byte) { + pool.Put(k) + } + } + return _fstMapAlloc(_fstMapOptions{ + hash: func(k []byte) fstMapHash { + return fstMapHash(xxhash.Sum64(k)) + }, + equals: bytes.Equal, + copy: copyFn, + finalize: finalizeFn, + initialSize: opts.InitialSize, + }) +} From 8d989e291194171b56596a4e8465baab1d5dff24 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 4 Nov 2020 02:56:20 -0500 Subject: [PATCH 022/106] Fix unlock --- src/m3ninx/index/segment/fst/segment.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index fb920d65bc..e3f6f1f2bf 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -844,7 +844,7 @@ func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (vellumFST, bool, er } r.termFSTs.Lock() - defer r.termFSTs.Lock() + defer r.termFSTs.Unlock() fst, ok = r.termFSTs.fstMap.Get(field) if ok { From ee0759885e9f9de6ada0db40ecdbe64e082b2c2b Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 9 Nov 2020 03:37:53 -0500 Subject: [PATCH 023/106] Reuse containers used for mutable postings lists --- go.mod | 2 +- go.sum | 4 +- src/dbnode/storage/index.go | 190 ++++++++++++---------- src/dbnode/storage/index_block_test.go | 4 +- src/m3ninx/index/segment/builder/terms.go | 1 - src/m3ninx/postings/roaring/roaring.go | 4 +- 6 files changed, 116 insertions(+), 89 deletions(-) diff --git a/go.mod b/go.mod index 91e3ee308a..51f58b10ed 100644 --- a/go.mod +++ b/go.mod @@ -61,7 +61,7 @@ require ( github.com/m3db/stackadler32 v0.0.0-20180104200216-bfebcd73ef6f github.com/m3db/stackmurmur3/v2 v2.0.2 github.com/m3db/tools v0.0.0-20181008195521-c6ded3f34878 - github.com/m3dbx/pilosa v1.4.1 + github.com/m3dbx/pilosa v1.4.2-0.20201109081833-6c9df43642fd github.com/m3dbx/vellum v0.0.0-20200826162549-f94c029903de github.com/mauricelam/genny v0.0.0-20180903214747-eb2c5232c885 github.com/mjibson/esc v0.1.0 diff --git a/go.sum b/go.sum index 027a861899..2becad596d 100644 --- a/go.sum +++ b/go.sum @@ -538,8 +538,8 @@ github.com/m3db/thrift v0.0.0-20190820191926-05b5a2227fe4 h1:1x3mMuURd3wqKJ2qVjh github.com/m3db/thrift v0.0.0-20190820191926-05b5a2227fe4/go.mod h1:xVfRinGzD3cYDRvMjy6RkIwM+iNL2KHNLZjT0VpVZT8= github.com/m3db/tools v0.0.0-20181008195521-c6ded3f34878 h1:kww0LtVVfGrXR7Ofpbi/9bvc2EGYMQC0LCH/gQXoolE= github.com/m3db/tools v0.0.0-20181008195521-c6ded3f34878/go.mod h1:TxroQUZzb1wzOsq+4+TfVtT7z89YTz3v2UJAYfLNfLE= -github.com/m3dbx/pilosa v1.4.1 h1:/Cpp1XAHSd6orpjceXGiKpCoDdYBP5BD/6NoqGG9eVg= -github.com/m3dbx/pilosa v1.4.1/go.mod h1:Jt0+w9O08sa7qWDeRC58VBjb4OeOTDMOhfvVmyeVCO8= +github.com/m3dbx/pilosa v1.4.2-0.20201109081833-6c9df43642fd h1:C+RCSMuplTpLH8Fiwb87XIkbYEabVf9itroR9+u4RYo= +github.com/m3dbx/pilosa v1.4.2-0.20201109081833-6c9df43642fd/go.mod h1:Jt0+w9O08sa7qWDeRC58VBjb4OeOTDMOhfvVmyeVCO8= github.com/m3dbx/vellum v0.0.0-20200826162549-f94c029903de h1:C4DpCfTNzJf5RhJqxOtfWAnD2d6ls7KDnK1boBGUnVg= github.com/m3dbx/vellum v0.0.0-20200826162549-f94c029903de/go.mod h1:DOTAUfV4bzK6Nrb0dboT/oCG0DnQuX+/n0jfZPh6xxI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 1aa73d162f..d41b0090f0 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ 
-160,10 +160,12 @@ type nsIndexState struct { blocksByTime map[xtime.UnixNano]index.Block latestBlock index.Block - // NB: `blockStartsDescOrder` contains the keys from the map `blocksByTime` in reverse - // chronological order. This is used at query time to enforce determinism about results - // returned. - blockStartsDescOrder []xtime.UnixNano + // NB: `blocksDescOrderImmutable` contains the keys from the map + // `blocksByTime` in reverse chronological order. This is used at query time + // to enforce determinism about results returned. + // NB(r): Reference to this slice can be safely taken for iteration purposes + // for Query(..) since it is rebuilt each time and immutable once built. + blocksDescOrderImmutable []blockAndBlockStart // shardsFilterID is set every time the shards change to correctly // only return IDs that this node owns. @@ -176,6 +178,11 @@ type nsIndexState struct { shardsAssigned map[uint32]struct{} } +type blockAndBlockStart struct { + block index.Block + blockStart xtime.UnixNano +} + // NB: nsIndexRuntimeOptions does not contain its own mutex as some of the variables // are needed for each index write which already at least acquires read lock from // nsIndex mutex, so to keep the lock acquisitions to a minimum these are protected @@ -514,13 +521,8 @@ func (i *nsIndex) reportStats() error { // iterate known blocks in a defined order of time (newest first) // for debug log ordering - for _, start := range i.state.blockStartsDescOrder { - block, ok := i.state.blocksByTime[start] - if !ok { - return i.missingBlockInvariantError(start) - } - - err := block.Stats(reporter) + for _, b := range i.state.blocksDescOrderImmutable { + err := b.block.Stats(reporter) if err == index.ErrUnableReportStatsBlockClosed { // Closed blocks are temporarily in the list still continue @@ -1321,18 +1323,21 @@ func (i *nsIndex) Query( query index.Query, opts index.QueryOptions, ) (index.QueryResult, error) { - logFields := []opentracinglog.Field{ - opentracinglog.String("query", query.String()), - opentracinglog.String("namespace", i.nsMetadata.ID().String()), - opentracinglog.Int("seriesLimit", opts.SeriesLimit), - opentracinglog.Int("docsLimit", opts.DocsLimit), - xopentracing.Time("queryStart", opts.StartInclusive), - xopentracing.Time("queryEnd", opts.EndExclusive), - } - - ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxQuery) - sp.LogFields(logFields...) + var logFields []opentracinglog.Field + ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQuery) defer sp.Finish() + if sampled { + // Only allocate metadata such as query string if sampling trace. + logFields = []opentracinglog.Field{ + opentracinglog.String("query", query.String()), + opentracinglog.String("namespace", i.nsMetadata.ID().String()), + opentracinglog.Int("seriesLimit", opts.SeriesLimit), + opentracinglog.Int("docsLimit", opts.DocsLimit), + xopentracing.Time("queryStart", opts.StartInclusive), + xopentracing.Time("queryEnd", opts.EndExclusive), + } + sp.LogFields(logFields...) + } // Get results and set the namespace ID and size limit. results := i.resultsPool.Get() @@ -1453,9 +1458,12 @@ func (i *nsIndex) query( execBlockFn execBlockQueryFn, logFields []opentracinglog.Field, ) (bool, error) { - ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxQueryHelper) - sp.LogFields(logFields...) + ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQueryHelper) defer sp.Finish() + if sampled { + // Only log fields if sampled. + sp.LogFields(logFields...) 
+ } exhaustive, err := i.queryWithSpan(ctx, query, results, opts, execBlockFn, sp, logFields) if err != nil { @@ -1529,21 +1537,20 @@ func (i *nsIndex) queryWithSpan( opts = i.overriddenOptsForQueryWithRLock(opts) timeout := i.timeoutForQueryWithRLock(ctx) - // Retrieve blocks to query, then we can release lock + // Retrieve blocks to query, then we can release lock. // NB(r): Important not to block ticking, and other tasks by // holding the RLock during a query. - blocks, err := i.blocksForQueryWithRLock(xtime.NewRanges(xtime.Range{ + qryRange := xtime.NewRanges(xtime.Range{ Start: opts.StartInclusive, End: opts.EndExclusive, - })) + }) + // NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's + // immutable and we only create an iterator over it. + iter := newBlocksIterStackAlloc(i.state.blocksDescOrderImmutable, qryRange) // Can now release the lock and execute the query without holding the lock. i.state.RUnlock() - if err != nil { - return false, err - } - var ( // State contains concurrent mutable state for async execution below. state = asyncQueryExecState{ @@ -1558,9 +1565,9 @@ func (i *nsIndex) queryWithSpan( cancellable := resource.NewCancellableLifetime() defer cancellable.Cancel() - for _, block := range blocks { + for iter, ok := iter.Next(); ok; iter, ok = iter.Next() { // Capture block for async query execution below. - block := block + block := iter.Current() // We're looping through all the blocks that we need to query and kicking // off parallel queries which are bounded by the queryWorkersPool's maximum @@ -1652,7 +1659,7 @@ func (i *nsIndex) queryWithSpan( state.Lock() // Take reference to vars to return while locked. exhaustive := state.exhaustive - err = state.multiErr.FinalError() + err := state.multiErr.FinalError() state.Unlock() if err != nil { @@ -1818,40 +1825,6 @@ func (i *nsIndex) overriddenOptsForQueryWithRLock( return opts } -func (i *nsIndex) blocksForQueryWithRLock(queryRange xtime.Ranges) ([]index.Block, error) { - // Chunk the query request into bounds based on applicable blocks and - // execute the requests to each of them; and merge results. - blocks := make([]index.Block, 0, len(i.state.blockStartsDescOrder)) - - // Iterate known blocks in a defined order of time (newest first) to enforce - // some determinism about the results returned. - for _, start := range i.state.blockStartsDescOrder { - // Terminate if queryRange doesn't need any more data - if queryRange.IsEmpty() { - break - } - - block, ok := i.state.blocksByTime[start] - if !ok { - // This is an invariant, should never occur if state tracking is correct. - return nil, i.missingBlockInvariantError(start) - } - - // Ensure the block has data requested by the query. - blockRange := xtime.Range{Start: block.StartTime(), End: block.EndTime()} - if !queryRange.Overlaps(blockRange) { - continue - } - - // Remove this range from the query range. 
- queryRange.RemoveRange(blockRange) - - blocks = append(blocks, block) - } - - return blocks, nil -} - func (i *nsIndex) ensureBlockPresent(blockStart time.Time) (index.Block, error) { i.state.RLock() defer i.state.RUnlock() @@ -1931,19 +1904,27 @@ func (i *nsIndex) updateBlockStartsWithLock() { latestBlock index.Block ) - blockStarts := make([]xtime.UnixNano, 0, len(i.state.blocksByTime)) + blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)) for ts, block := range i.state.blocksByTime { if ts >= latestBlockStart { latestBlock = block } - blockStarts = append(blockStarts, ts) + blocks = append(blocks, blockAndBlockStart{ + block: block, + blockStart: ts, + }) } // order in desc order (i.e. reverse chronological) - sort.Slice(blockStarts, func(i, j int) bool { - return blockStarts[i] > blockStarts[j] + sort.Slice(blocks, func(i, j int) bool { + return blocks[i].blockStart > blocks[j].blockStart }) - i.state.blockStartsDescOrder = blockStarts + // NB(r): Important not to modify this once set since we take reference + // to this slice with an RLock, release with RUnlock and then loop over it + // during query time so it must not be altered and stay immutable. + // This is done to avoid allocating a copy of the slice at query time for + // each query. + i.state.blocksDescOrderImmutable = blocks // rotate latestBlock i.state.latestBlock = latestBlock @@ -2130,7 +2111,7 @@ func (i *nsIndex) Close() error { i.state.latestBlock = nil i.state.blocksByTime = nil - i.state.blockStartsDescOrder = nil + i.state.blocksDescOrderImmutable = nil if i.runtimeOptsListener != nil { i.runtimeOptsListener.Close() @@ -2158,14 +2139,6 @@ func (i *nsIndex) Close() error { return multiErr.FinalError() } -func (i *nsIndex) missingBlockInvariantError(t xtime.UnixNano) error { - err := fmt.Errorf("index query did not find block %d despite seeing it in slice", t) - instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { - l.Error(err.Error()) - }) - return err -} - func (i *nsIndex) unableToAllocBlockInvariantError(err error) error { ierr := fmt.Errorf("index unable to allocate block: %v", err) instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) { @@ -2406,3 +2379,56 @@ func (shards dbShards) IDs() []uint32 { } return ids } + +// blocksIterStackAlloc is a stack allocated block iterator, ensuring no +// allocations per query. +type blocksIterStackAlloc struct { + blocks []blockAndBlockStart + queryRanges xtime.Ranges + idx int +} + +func newBlocksIterStackAlloc( + blocks []blockAndBlockStart, + queryRanges xtime.Ranges, +) blocksIterStackAlloc { + return blocksIterStackAlloc{ + blocks: blocks, + queryRanges: queryRanges, + idx: -1, + } +} + +func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { + iter := i + if i.queryRanges.IsEmpty() { + return iter, false + } + + for { + iter.idx++ + if iter.idx >= len(i.blocks) { + return iter, false + } + + block := i.blocks[iter.idx].block + + // Ensure the block has data requested by the query. + blockRange := xtime.Range{ + Start: block.StartTime(), + End: block.EndTime(), + } + if !i.queryRanges.Overlaps(blockRange) { + continue + } + + // Remove this range from the query range. 
+ i.queryRanges.RemoveRange(blockRange) + + return iter, true + } +} + +func (i blocksIterStackAlloc) Current() index.Block { + return i.blocks[i.idx].block +} diff --git a/src/dbnode/storage/index_block_test.go b/src/dbnode/storage/index_block_test.go index a4dc04d9e5..85002a2920 100644 --- a/src/dbnode/storage/index_block_test.go +++ b/src/dbnode/storage/index_block_test.go @@ -161,10 +161,10 @@ func TestNamespaceIndexNewBlockFn(t *testing.T) { require.NoError(t, index.Close()) }() - blocksSlice := index.(*nsIndex).state.blockStartsDescOrder + blocksSlice := index.(*nsIndex).state.blocksDescOrderImmutable require.Equal(t, 1, len(blocksSlice)) - require.Equal(t, xtime.ToUnixNano(now.Truncate(blockSize)), blocksSlice[0]) + require.Equal(t, xtime.ToUnixNano(now.Truncate(blockSize)), blocksSlice[0].blockStart) require.Equal(t, mockBlock, index.(*nsIndex).state.latestBlock) diff --git a/src/m3ninx/index/segment/builder/terms.go b/src/m3ninx/index/segment/builder/terms.go index 0a44c4ac04..97dda72ab6 100644 --- a/src/m3ninx/index/segment/builder/terms.go +++ b/src/m3ninx/index/segment/builder/terms.go @@ -63,7 +63,6 @@ func (t *terms) post(term []byte, id postings.ID) error { NoCopyKey: true, NoFinalizeKey: true, }) - } // If empty posting list, track insertion of this key into the terms diff --git a/src/m3ninx/postings/roaring/roaring.go b/src/m3ninx/postings/roaring/roaring.go index b7c0d8d509..33957a54d6 100644 --- a/src/m3ninx/postings/roaring/roaring.go +++ b/src/m3ninx/postings/roaring/roaring.go @@ -177,7 +177,9 @@ func (d *postingsList) RemoveRange(min, max postings.ID) error { } func (d *postingsList) Reset() { - d.bitmap.Reset() + // NB(r): Use direct remove all to retain allocated containers + // on the bitmap. + d.bitmap.DirectRemoveAll() } func (d *postingsList) Contains(i postings.ID) bool { From d0cc13ffac9a8e272cd8a597345a3df9f31cadce Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 9 Nov 2020 03:41:19 -0500 Subject: [PATCH 024/106] Apply capacity fix --- src/m3ninx/postings/roaring/bitmap_readonly.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index 8af61656ad..20d412b004 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -228,7 +228,7 @@ func (c readOnlyContainer) bitmap() (bitmapReadOnlyContainer, bool) { return bitmapReadOnlyContainer{}, false } return bitmapReadOnlyContainer{ - values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN], + values: (*[0xFFFFFFF]uint64)(unsafe.Pointer(&c.data[c.offset]))[:bitmapN:bitmapN], }, true } @@ -237,7 +237,7 @@ func (c readOnlyContainer) array() (arrayReadOnlyContainer, bool) { return arrayReadOnlyContainer{}, false } return arrayReadOnlyContainer{ - values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality], + values: (*[0xFFFFFFF]uint16)(unsafe.Pointer(&c.data[c.offset]))[:c.cardinality:c.cardinality], }, true } @@ -247,7 +247,7 @@ func (c readOnlyContainer) runs() (runReadOnlyContainer, bool) { } runCount := binary.LittleEndian.Uint16(c.data[c.offset : c.offset+runCountHeaderSize]) return runReadOnlyContainer{ - values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount], + values: (*[0xFFFFFFF]interval16)(unsafe.Pointer(&c.data[c.offset+runCountHeaderSize]))[:runCount:runCount], }, true } From fd2af0c5831e83ddf5c35fc5680330ffbd0fe723 Mon Sep 17 00:00:00 2001 From: 
Rob Skillington
Date: Mon, 9 Nov 2020 19:00:05 -0500
Subject: [PATCH 025/106] Add concurrency for queries with lots of segments and pool postings locally in builder

---
 src/dbnode/server/server.go | 14 +-
 src/dbnode/storage/index.go | 2 +-
 src/dbnode/storage/index/block.go | 189 +++++++++++++-----
 src/dbnode/storage/index/options.go | 20 ++
 .../storage/index/read_through_segment.go | 5 +
 src/dbnode/storage/index/types.go | 7 +
 src/dbnode/storage/options.go | 10 -
 src/dbnode/storage/types.go | 7 -
 src/m3ninx/index/segment/builder/builder.go | 7 +
 .../segment/builder/multi_segments_builder.go | 4 +
 src/m3ninx/index/segment/builder/terms.go | 22 +-
 .../index/segment/fst/encoding/docs/slice.go | 5 +
 src/m3ninx/index/segment/fst/segment.go | 12 ++
 src/m3ninx/index/segment/mem/reader.go | 9 +
 src/m3ninx/index/types.go | 1 +
 15 files changed, 231 insertions(+), 83 deletions(-)

diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go
index 38cd632674..1626acab51 100644
--- a/src/dbnode/server/server.go
+++ b/src/dbnode/server/server.go
@@ -361,14 +361,6 @@ func Run(runOpts RunOptions) {
 
 opentracing.SetGlobalTracer(tracer)
 
- if cfg.Index.MaxQueryIDsConcurrency != 0 {
- queryIDsWorkerPool := xsync.NewWorkerPool(cfg.Index.MaxQueryIDsConcurrency)
- queryIDsWorkerPool.Init()
- opts = opts.SetQueryIDsWorkerPool(queryIDsWorkerPool)
- } else {
- logger.Warn("max index query IDs concurrency was not set, falling back to default value")
- }
-
 buildReporter := instrument.NewBuildReporter(iopts)
 if err := buildReporter.Start(); err != nil {
 logger.Fatal("unable to start build reporter", zap.Error(err))
@@ -451,6 +443,12 @@ func Run(runOpts RunOptions) {
 if cfg.WriteNewSeriesAsync {
 insertMode = index.InsertAsync
 }
+ if cfg.Index.MaxQueryIDsConcurrency != 0 {
+ queryWorkerPool := xsync.NewWorkerPool(cfg.Index.MaxQueryIDsConcurrency)
+ queryWorkerPool.Init()
+ indexOpts = indexOpts.SetQueryWorkerPool(queryWorkerPool)
+ }
+
 indexOpts = indexOpts.SetInsertMode(insertMode).
 SetPostingsListCache(postingsListCache).
SetReadThroughSegmentOptions(index.ReadThroughSegmentOptions{ diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index d41b0090f0..3b081aeb2c 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -356,7 +356,7 @@ func newNamespaceIndexWithOptions( resultsPool: indexOpts.QueryResultsPool(), aggregateResultsPool: indexOpts.AggregateResultsPool(), - queryWorkersPool: newIndexOpts.opts.QueryIDsWorkerPool(), + queryWorkersPool: newIndexOpts.opts.IndexOptions().QueryWorkerPool(), metrics: newNamespaceIndexMetrics(indexOpts, instrumentOpts), doNotIndexWithFields: doNotIndexWithFields, diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 62239a47ab..c9f2b6bac6 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -25,6 +25,8 @@ import ( "errors" "fmt" "io" + "math" + "sort" "sync" "time" @@ -255,7 +257,6 @@ func NewBlock( docsLimit: opts.QueryLimits().DocsLimit(), } b.newFieldsAndTermsIteratorFn = newFieldsAndTermsIterator - b.newExecutorWithRLockFn = b.executorWithRLock return b, nil } @@ -323,20 +324,6 @@ func (b *block) writesAcceptedWithRLock() bool { b.nsMD.Options().ColdWritesEnabled() } -func (b *block) executorWithRLock() (search.Executor, error) { - readers, err := b.segmentReadersWithRLock() - if err != nil { - return nil, err - } - - indexReaders := make([]m3ninxindex.Reader, 0, len(readers)) - for _, r := range readers { - indexReaders = append(indexReaders, r) - } - - return executor.NewExecutor(indexReaders), nil -} - func (b *block) segmentReadersWithRLock() ([]segment.Reader, error) { expectedReaders := b.mutableSegments.Len() for _, coldSeg := range b.coldMutableSegments { @@ -391,53 +378,16 @@ func (b *block) segmentReadersWithRLock() ([]segment.Reader, error) { return readers, nil } -// Query acquires a read lock on the block so that the segments -// are guaranteed to not be freed/released while accumulating results. -// This allows references to the mmap'd segment data to be accumulated -// and then copied into the results before this method returns (it is not -// safe to return docs directly from the segments from this method, the -// results datastructure is used to copy it every time documents are added -// to the results datastructure). -func (b *block) Query( - ctx context.Context, - cancellable *resource.CancellableLifetime, - query Query, - opts QueryOptions, - results BaseResults, - logFields []opentracinglog.Field, -) (bool, error) { - ctx, sp := ctx.StartTraceSpan(tracepoint.BlockQuery) - sp.LogFields(logFields...) - defer sp.Finish() - - exhaustive, err := b.queryWithSpan(ctx, cancellable, query, opts, results, sp, logFields) - if err != nil { - sp.LogFields(opentracinglog.Error(err)) - } - - return exhaustive, err -} - -func (b *block) queryWithSpan( +func (b *block) queryWithSpanRLock( ctx context.Context, cancellable *resource.CancellableLifetime, query Query, opts QueryOptions, results BaseResults, sp opentracing.Span, - logFields []opentracinglog.Field, + segmentReaders []m3ninxindex.Reader, ) (bool, error) { - b.RLock() - defer b.RUnlock() - - if b.state == blockStateClosed { - return false, ErrUnableToQueryBlockClosed - } - - exec, err := b.newExecutorWithRLockFn() - if err != nil { - return false, err - } + exec := executor.NewExecutor(segmentReaders) // Make sure if we don't register to close the executor later // that we close it before returning. 
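The hunk that follows fans a single query out over groups of segment readers: readers are sorted by NumDocs ascending and then dealt round-robin, so every group gets a similar mix of small and large segments. A standalone sketch of just that partitioning step (the helper name and sizes are illustrative, not part of this patch):

    package main

    import (
        "fmt"
        "sort"
    )

    // dealRoundRobin sorts sizes ascending and deals them across n groups
    // so no group ends up holding only the largest segments.
    func dealRoundRobin(docCounts []int, n int) [][]int {
        sorted := append([]int(nil), docCounts...)
        sort.Ints(sorted)
        groups := make([][]int, n)
        for i, c := range sorted {
            groups[i%n] = append(groups[i%n], c)
        }
        return groups
    }

    func main() {
        // Nine segments of varying size dealt into three groups.
        fmt.Println(dealRoundRobin([]int{900, 10, 40, 7000, 300, 25, 1200, 5, 60}, 3))
        // Output: [[5 40 900] [10 60 1200] [25 300 7000]]
    }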
@@ -522,6 +472,135 @@ func (b *block) queryWithSpan( return opts.exhaustive(size, docsCount), nil } +// Query acquires a read lock on the block so that the segments +// are guaranteed to not be freed/released while accumulating results. +// This allows references to the mmap'd segment data to be accumulated +// and then copied into the results before this method returns (it is not +// safe to return docs directly from the segments from this method, the +// results datastructure is used to copy it every time documents are added +// to the results datastructure). +func (b *block) Query( + ctx context.Context, + cancellable *resource.CancellableLifetime, + query Query, + opts QueryOptions, + results BaseResults, + logFields []opentracinglog.Field, +) (bool, error) { + ctx, sp := ctx.StartTraceSpan(tracepoint.BlockQuery) + sp.LogFields(logFields...) + defer sp.Finish() + + exhaustive, err := b.queryWithSpan(ctx, cancellable, query, opts, results, sp) + if err != nil { + sp.LogFields(opentracinglog.Error(err)) + } + + return exhaustive, err +} + +const ( + queryGroupReadersParallelism = 8 + queryGroupSize = 8 +) + +type queryGroup struct { + readers []m3ninxindex.Reader + exhaustive bool + err error +} + +func (b *block) queryWithSpan( + ctx context.Context, + cancellable *resource.CancellableLifetime, + query Query, + opts QueryOptions, + results BaseResults, + sp opentracing.Span, +) (bool, error) { + b.RLock() + defer b.RUnlock() + + if b.state == blockStateClosed { + return false, ErrUnableToQueryBlockClosed + } + + readers, err := b.segmentReadersWithRLock() + if err != nil { + return false, err + } + + segmentReaders := make([]m3ninxindex.Reader, 0, len(readers)) + for _, reader := range readers { + segmentReaders = append(segmentReaders, reader) + } + + if len(segmentReaders) < queryGroupReadersParallelism { + // Query no parallelism. + return b.queryWithSpanRLock(ctx, cancellable, query, opts, + results, sp, segmentReaders) + } + + var ( + groupsN = int(math.Ceil(float64(len(readers)) / float64(queryGroupSize))) + groups = make([]queryGroup, groupsN) + jobs = make([]m3ninxindex.Reader, groupsN*queryGroupSize) + workers = b.opts.QueryWorkerPool() + wg sync.WaitGroup + ) + // Create query group jobs. + for i := 0; i < groupsN; i++ { + groupJobs := jobs[:queryGroupSize] + jobs = jobs[queryGroupSize:] + groups[i] = queryGroup{ + // Jobs backed by single bulk alloc slice, but start zero length. + readers: groupJobs[:0], + } + } + // Allocate jobs to groups, first sort by size. + sort.Slice(segmentReaders, func(i, j int) bool { + nI, _ := segmentReaders[i].NumDocs() + nJ, _ := segmentReaders[j].NumDocs() + return nI < nJ + }) + // Now allocate round robin. + for i, reader := range segmentReaders { + group := i % groupsN + groups[group].readers = append(groups[group].readers, reader) + } + + // Launch async queries. + for i := 1; i < groupsN; i++ { + i := i + wg.Add(1) + workers.Go(func() { + exhaustive, err := b.queryWithSpanRLock(ctx, cancellable, query, opts, + results, sp, groups[i].readers) + groups[i].exhaustive, groups[i].err = exhaustive, err + wg.Done() + }) + } + + // Save an extra goroutine to execute synchronously on local goroutine. + exhaustive, err := b.queryWithSpanRLock(ctx, cancellable, query, opts, + results, sp, groups[0].readers) + if err != nil { + return false, err + } + + // Wait for others. + wg.Wait() + + // Collate exhaustive. 
+ for i := 1; i < groupsN; i++ { + if err := groups[i].err; err != nil { + return false, err + } + exhaustive = exhaustive && groups[i].exhaustive + } + return exhaustive, nil +} + func (b *block) closeAsync(closer io.Closer) { if err := closer.Close(); err != nil { // Note: This only happens if closing the readers isn't clean. diff --git a/src/dbnode/storage/index/options.go b/src/dbnode/storage/index/options.go index ba5243272e..3daf1a63c0 100644 --- a/src/dbnode/storage/index/options.go +++ b/src/dbnode/storage/index/options.go @@ -22,6 +22,8 @@ package index import ( "errors" + "math" + "runtime" "github.com/m3db/m3/src/dbnode/clock" "github.com/m3db/m3/src/dbnode/storage/index/compaction" @@ -34,6 +36,7 @@ import ( "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" "github.com/m3db/m3/src/x/pool" + xsync "github.com/m3db/m3/src/x/sync" ) const ( @@ -125,6 +128,7 @@ type opts struct { readThroughSegmentOptions ReadThroughSegmentOptions mmapReporter mmap.Reporter queryLimits limits.QueryLimits + queryWorkerPool xsync.WorkerPool } var undefinedUUIDFn = func() ([]byte, error) { return nil, errIDGenerationDisabled } @@ -159,6 +163,11 @@ func NewOptions() Options { aggResultsEntryArrayPool.Init() instrumentOpts := instrument.NewOptions() + + // Default to using half of the available cores for querying. + queryWorkerPool := xsync.NewWorkerPool(int(math.Ceil(float64(runtime.NumCPU()) / 2))) + queryWorkerPool.Init() + opts := &opts{ insertMode: defaultIndexInsertMode, clockOpts: clock.NewOptions(), @@ -176,6 +185,7 @@ func NewOptions() Options { foregroundCompactionPlannerOpts: defaultForegroundCompactionOpts, backgroundCompactionPlannerOpts: defaultBackgroundCompactionOpts, queryLimits: limits.NoOpQueryLimits(), + queryWorkerPool: queryWorkerPool, } resultsPool.Init(func() QueryResults { return NewQueryResults(nil, QueryResultsOptions{}, opts) @@ -428,3 +438,13 @@ func (o *opts) SetQueryLimits(value limits.QueryLimits) Options { func (o *opts) QueryLimits() limits.QueryLimits { return o.queryLimits } + +func (o *opts) SetQueryWorkerPool(value xsync.WorkerPool) Options { + opts := *o + opts.queryWorkerPool = value + return &opts +} + +func (o *opts) QueryWorkerPool() xsync.WorkerPool { + return o.queryWorkerPool +} diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 8d93fe1a13..b234ba275b 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -267,6 +267,11 @@ func (s *readThroughSegmentReader) Doc(id postings.ID) (doc.Document, error) { return s.reader.Doc(id) } +// NumDocs is a pass through call, since there's no postings list to cache. +func (s *readThroughSegmentReader) NumDocs() (int, error) { + return s.reader.NumDocs() +} + // Docs is a pass through call, since there's no postings list to cache. 
func (s *readThroughSegmentReader) Docs(pl postings.List) (doc.Iterator, error) { return s.reader.Docs(pl) diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 88323e7909..5e841fe558 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -42,6 +42,7 @@ import ( "github.com/m3db/m3/src/x/mmap" "github.com/m3db/m3/src/x/pool" "github.com/m3db/m3/src/x/resource" + xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" opentracinglog "github.com/opentracing/opentracing-go/log" @@ -989,4 +990,10 @@ type Options interface { // QueryLimits returns the current query limits. QueryLimits() limits.QueryLimits + + // SetQueryWorkerPool sets the QueryIDs worker pool. + SetQueryWorkerPool(value xsync.WorkerPool) Options + + // QueryWorkerPool returns the QueryIDs worker pool. + QueryWorkerPool() xsync.WorkerPool } diff --git a/src/dbnode/storage/options.go b/src/dbnode/storage/options.go index 3beba1a5a4..310d78d42f 100644 --- a/src/dbnode/storage/options.go +++ b/src/dbnode/storage/options.go @@ -681,16 +681,6 @@ func (o *options) FetchBlocksMetadataResultsPool() block.FetchBlocksMetadataResu return o.fetchBlocksMetadataResultsPool } -func (o *options) SetQueryIDsWorkerPool(value xsync.WorkerPool) Options { - opts := *o - opts.queryIDsWorkerPool = value - return &opts -} - -func (o *options) QueryIDsWorkerPool() xsync.WorkerPool { - return o.queryIDsWorkerPool -} - func (o *options) SetWriteBatchPool(value *writes.WriteBatchPool) Options { opts := *o opts.writeBatchPool = value diff --git a/src/dbnode/storage/types.go b/src/dbnode/storage/types.go index b7753bc9f8..c868a36994 100644 --- a/src/dbnode/storage/types.go +++ b/src/dbnode/storage/types.go @@ -52,7 +52,6 @@ import ( "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" "github.com/m3db/m3/src/x/pool" - xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" ) @@ -1214,12 +1213,6 @@ type Options interface { // FetchBlocksMetadataResultsPool returns the fetchBlocksMetadataResultsPool. FetchBlocksMetadataResultsPool() block.FetchBlocksMetadataResultsPool - // SetQueryIDsWorkerPool sets the QueryIDs worker pool. - SetQueryIDsWorkerPool(value xsync.WorkerPool) Options - - // QueryIDsWorkerPool returns the QueryIDs worker pool. - QueryIDsWorkerPool() xsync.WorkerPool - // SetWriteBatchPool sets the WriteBatch pool. 
SetWriteBatchPool(value *writes.WriteBatchPool) Options diff --git a/src/m3ninx/index/segment/builder/builder.go b/src/m3ninx/index/segment/builder/builder.go index ac1cd597e5..81bf308a79 100644 --- a/src/m3ninx/index/segment/builder/builder.go +++ b/src/m3ninx/index/segment/builder/builder.go @@ -497,6 +497,13 @@ func (b *builder) Doc(id postings.ID) (doc.Document, error) { return b.docs[idx], nil } +func (b *builder) NumDocs() (int, error) { + b.status.RLock() + defer b.status.RUnlock() + + return len(b.docs), nil +} + func (b *builder) Docs() []doc.Document { b.status.RLock() defer b.status.RUnlock() diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go index 459b93f524..e9779b79dd 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_builder.go +++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go @@ -162,6 +162,10 @@ func (b *builderFromSegments) Doc(id postings.ID) (doc.Document, error) { return b.docs[idx], nil } +func (b *builderFromSegments) NumDocs() (int, error) { + return len(b.docs), nil +} + func (b *builderFromSegments) FieldsIterable() segment.FieldsIterable { return b } diff --git a/src/m3ninx/index/segment/builder/terms.go b/src/m3ninx/index/segment/builder/terms.go index 97dda72ab6..92f3db6d89 100644 --- a/src/m3ninx/index/segment/builder/terms.go +++ b/src/m3ninx/index/segment/builder/terms.go @@ -30,6 +30,7 @@ import ( type terms struct { opts Options pool postings.Pool + poolLocal []postings.MutableList postings *PostingsMap postingsListUnion postings.MutableList uniqueTerms []termElem @@ -55,10 +56,27 @@ func (t *terms) size() int { return len(t.uniqueTerms) } +func (t *terms) poolGet() postings.MutableList { + if len(t.poolLocal) == 0 { + return t.pool.Get() + } + + last := len(t.poolLocal) - 1 + elem := t.poolLocal[last] + t.poolLocal = t.poolLocal[:last] + + return elem +} + +func (t *terms) poolPut(v postings.MutableList) { + v.Reset() + t.poolLocal = append(t.poolLocal, v) +} + func (t *terms) post(term []byte, id postings.ID) error { postingsList, ok := t.postings.Get(term) if !ok { - postingsList = t.pool.Get() + postingsList = t.poolGet() t.postings.SetUnsafe(term, postingsList, PostingsMapSetUnsafeOptions{ NoCopyKey: true, NoFinalizeKey: true, @@ -106,7 +124,7 @@ func (t *terms) sortIfRequired() { func (t *terms) reset() { // Keep postings map lookup, return postings lists to pool for _, entry := range t.postings.Iter() { - t.pool.Put(entry.Value()) + t.poolPut(entry.Value()) } t.postings.Reset() t.postingsListUnion.Reset() diff --git a/src/m3ninx/index/segment/fst/encoding/docs/slice.go b/src/m3ninx/index/segment/fst/encoding/docs/slice.go index ccf7fd48b6..de9a3c9df3 100644 --- a/src/m3ninx/index/segment/fst/encoding/docs/slice.go +++ b/src/m3ninx/index/segment/fst/encoding/docs/slice.go @@ -66,6 +66,11 @@ func (r *SliceReader) Doc(id postings.ID) (doc.Document, error) { return r.Read(id) } +// NumDocs returns number of docs. +func (r *SliceReader) NumDocs() (int, error) { + return len(r.docs), nil +} + // Iter returns a docs iterator. 
func (r *SliceReader) Iter() index.IDDocIterator { postingsIter := postings.NewRangeIterator(0, postings.ID(r.Len())) diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index e3f6f1f2bf..39edfea328 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -1143,6 +1143,18 @@ func (sr *fsSegmentReader) Doc(id postings.ID) (doc.Document, error) { return pl, err } +func (sr *fsSegmentReader) NumDocs() (int, error) { + if sr.closed { + return 0, errReaderClosed + } + // NB(r): We are allowed to call match field after Close called on + // the segment but not after it is finalized. + sr.fsSegment.RLock() + n := sr.fsSegment.numDocs + sr.fsSegment.RUnlock() + return int(n), nil +} + func (sr *fsSegmentReader) Docs(pl postings.List) (doc.Iterator, error) { if sr.closed { return nil, errReaderClosed diff --git a/src/m3ninx/index/segment/mem/reader.go b/src/m3ninx/index/segment/mem/reader.go index e77a72ece6..7b0a6496aa 100644 --- a/src/m3ninx/index/segment/mem/reader.go +++ b/src/m3ninx/index/segment/mem/reader.go @@ -163,6 +163,15 @@ func (r *reader) AllDocs() (index.IDDocIterator, error) { return r.getDocIterWithLock(pi), nil } +func (r *reader) NumDocs() (int, error) { + r.RLock() + defer r.RUnlock() + if r.closed { + return 0, errSegmentReaderClosed + } + return len(r.segment.docs.data), nil +} + func (r *reader) getDocIterWithLock(iter postings.Iterator) index.IDDocIterator { return index.NewIDDocIterator(r, iter) } diff --git a/src/m3ninx/index/types.go b/src/m3ninx/index/types.go index 7dec18a368..2d5fefae7f 100644 --- a/src/m3ninx/index/types.go +++ b/src/m3ninx/index/types.go @@ -97,6 +97,7 @@ type CompiledRegex struct { // DocRetriever returns the document associated with a postings ID. It returns // ErrDocNotFound if there is no document corresponding to the given postings ID. type DocRetriever interface { + NumDocs() (int, error) Doc(id postings.ID) (doc.Document, error) } From d82a47d0b0353d9fed1c16f50a5940b5e69c636d Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 10 Nov 2020 18:59:03 -0500 Subject: [PATCH 026/106] Use separate pools for querying segments separately --- src/dbnode/server/server.go | 10 ++- src/dbnode/storage/index/block.go | 115 +++++++++++++++------------- src/dbnode/storage/index/options.go | 36 ++++++--- src/dbnode/storage/index/types.go | 14 +++- src/x/sync/types.go | 9 +++ src/x/sync/worker_pool.go | 8 ++ 6 files changed, 123 insertions(+), 69 deletions(-) diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index 76e85bb3f4..39ed4f939b 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -452,9 +452,13 @@ func Run(runOpts RunOptions) { insertMode = index.InsertAsync } if cfg.Index.MaxQueryIDsConcurrency != 0 { - queryWorkerPool := xsync.NewWorkerPool(cfg.Index.MaxQueryIDsConcurrency) - queryWorkerPool.Init() - indexOpts = indexOpts.SetQueryWorkerPool(queryWorkerPool) + queryBlockSegmentWorkerPool := xsync.NewWorkerPool(cfg.Index.MaxQueryIDsConcurrency) + queryBlockSegmentWorkerPool.Init() + queryBlockWorkerPool := xsync.NewWorkerPool(2 * cfg.Index.MaxQueryIDsConcurrency) + queryBlockWorkerPool.Init() + indexOpts = indexOpts. + SetQueryBlockSegmentWorkerPool(queryBlockSegmentWorkerPool). + SetQueryBlockWorkerPool(queryBlockWorkerPool) } indexOpts = indexOpts.SetInsertMode(insertMode). 
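The block.go diff below throttles even the non-parallel path by calling GetToken/PutToken on the segments worker pool around the query. Those two methods (touched by this patch's src/x/sync changes) are assumed here to behave like a buffered-channel semaphore; a minimal sketch of those semantics, not the actual src/x/sync implementation:

    package main

    import "sync"

    // tokenPool sketches GetToken/PutToken/Go semantics: at most
    // cap(tokens) callers hold a token at any one time.
    type tokenPool struct{ tokens chan struct{} }

    func newTokenPool(size int) *tokenPool {
        return &tokenPool{tokens: make(chan struct{}, size)}
    }

    // GetToken blocks until a worker slot is free.
    func (p *tokenPool) GetToken() { p.tokens <- struct{}{} }

    // PutToken releases a slot acquired with GetToken.
    func (p *tokenPool) PutToken() { <-p.tokens }

    // Go runs fn on its own goroutine once a slot is free, releasing
    // the slot when fn returns.
    func (p *tokenPool) Go(fn func()) {
        p.tokens <- struct{}{}
        go func() {
            defer func() { <-p.tokens }()
            fn()
        }()
    }

    func main() {
        pool := newTokenPool(2)
        var wg sync.WaitGroup
        for i := 0; i < 4; i++ {
            wg.Add(1)
            pool.Go(func() { defer wg.Done() }) // at most two run at once
        }
        wg.Wait()
    }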
diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go
index d4c2742a85..9054348845 100644
--- a/src/dbnode/storage/index/block.go
+++ b/src/dbnode/storage/index/block.go
@@ -47,9 +47,9 @@ import (
 "github.com/m3db/m3/src/x/instrument"
 "github.com/m3db/m3/src/x/resource"
 xresource "github.com/m3db/m3/src/x/resource"
+ xsync "github.com/m3db/m3/src/x/sync"
 xtime "github.com/m3db/m3/src/x/time"
 
- "github.com/opentracing/opentracing-go"
 opentracinglog "github.com/opentracing/opentracing-go/log"
 "github.com/uber-go/tally"
 "go.uber.org/zap"
@@ -146,6 +146,7 @@ type block struct {
 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager
 queryLimits limits.QueryLimits
 docsLimit limits.LookbackLimit
+ querySegmentsWorkers xsync.WorkerPool
 
 metrics blockMetrics
 logger *zap.Logger
@@ -256,6 +257,7 @@ func NewBlock(
 logger: iopts.Logger(),
 queryLimits: opts.QueryLimits(),
 docsLimit: opts.QueryLimits().DocsLimit(),
+ querySegmentsWorkers: opts.QueryBlockSegmentsWorkerPool(),
 }
 b.newFieldsAndTermsIteratorFn = newFieldsAndTermsIterator
 
@@ -400,7 +402,7 @@ func (b *block) Query(
 sp.LogFields(logFields...)
 }
 
- exhaustive, err := b.queryWithSpan(ctx, cancellable, query, opts, results)
+ exhaustive, err := b.queryNoLock(ctx, cancellable, query, opts, results)
 if err != nil {
 sp.LogFields(opentracinglog.Error(err))
 }
@@ -419,21 +421,25 @@ type queryGroup struct {
 err error
 }
 
-func (b *block) queryWithSpan(
- ctx context.Context,
- cancellable *resource.CancellableLifetime,
- query Query,
- opts QueryOptions,
- results BaseResults,
-) (bool, error) {
+func (b *block) segmentReadersNoLock() ([]segment.Reader, error) {
 b.RLock()
 defer b.RUnlock()
 
 if b.state == blockStateClosed {
- return false, ErrUnableToQueryBlockClosed
+ return nil, ErrUnableToQueryBlockClosed
 }
 
- readers, err := b.segmentReadersWithRLock()
+ return b.segmentReadersWithRLock()
+}
+
+func (b *block) queryNoLock(
+ ctx context.Context,
+ cancellable *resource.CancellableLifetime,
+ query Query,
+ opts QueryOptions,
+ results BaseResults,
+) (bool, error) {
+ readers, err := b.segmentReadersNoLock()
 if err != nil {
 return false, err
 }
@@ -444,16 +450,19 @@ func (b *block) queryWithSpan(
 }
 
 if len(segmentReaders) < queryGroupReadersParallelism {
- // Query no parallelism.
- return b.queryWithSpanAndRLock(ctx, cancellable, query,
+ // Query with no parallelism, but ensure not to overwhelm by limiting
+ // concurrency via the query segments worker pool.
+ b.querySegmentsWorkers.GetToken()
+ exhaustive, err := b.queryReadersNoLock(ctx, cancellable, query,
 opts, results, segmentReaders)
+ b.querySegmentsWorkers.PutToken()
+ return exhaustive, err
 }
 
 var (
 groupsN = int(math.Ceil(float64(len(readers)) / float64(queryGroupSize)))
 groups = make([]queryGroup, groupsN)
 jobs = make([]m3ninxindex.Reader, groupsN*queryGroupSize)
- workers = b.opts.QueryWorkerPool()
 wg sync.WaitGroup
 )
 // Create query group jobs.
@@ -481,8 +490,8 @@ func (b *block) queryWithSpan(
 for i := 1; i < groupsN; i++ {
 i := i
 wg.Add(1)
- workers.Go(func() {
- exhaustive, err := b.queryWithSpanAndRLock(ctx, cancellable, query,
+ b.querySegmentsWorkers.Go(func() {
+ exhaustive, err := b.queryReadersNoLock(ctx, cancellable, query,
 opts, results, groups[i].readers)
 groups[i].exhaustive, groups[i].err = exhaustive, err
 wg.Done()
@@ -490,8 +499,10 @@ func (b *block) queryWithSpan(
 }
 
 // Save an extra goroutine to execute synchronously on local goroutine.
- exhaustive, err := b.queryWithSpanAndRLock(ctx, cancellable, query, + b.querySegmentsWorkers.GetToken() + exhaustive, err := b.queryReadersNoLock(ctx, cancellable, query, opts, results, groups[0].readers) + b.querySegmentsWorkers.PutToken() if err != nil { return false, err } @@ -509,7 +520,7 @@ func (b *block) queryWithSpan( return exhaustive, nil } -func (b *block) queryWithSpanAndRLock( +func (b *block) queryReadersNoLock( ctx context.Context, cancellable *xresource.CancellableLifetime, query Query, @@ -524,16 +535,10 @@ func (b *block) queryWithSpanAndRLock( execCloseRegistered := false defer func() { if !execCloseRegistered { - b.closeAsync(exec) + b.closeAsyncNoLock(exec) } }() - // FOLLOWUP(prateek): push down QueryOptions to restrict results - iter, err := exec.Execute(query.Query.SearchQuery()) - if err != nil { - return false, err - } - // Register the executor to close when context closes // so can avoid copying the results into the map and just take // references to it. @@ -546,10 +551,17 @@ func (b *block) queryWithSpanAndRLock( } execCloseRegistered = true // Make sure to not locally close it. ctx.RegisterFinalizer(xresource.FinalizerFn(func() { - b.closeAsync(exec) + b.closeAsyncNoLock(exec) })) cancellable.ReleaseCheckout() + // Perform actual search to start iteration. + // FOLLOWUP(prateek): push down QueryOptions to restrict results + iter, err := exec.Execute(query.Query.SearchQuery()) + if err != nil { + return false, err + } + var ( iterCloser = safeCloser{closable: iter} size = results.Size() @@ -578,7 +590,8 @@ func (b *block) queryWithSpanAndRLock( continue } - batch, size, docsCount, err = b.addQueryResults(cancellable, results, batch) + batch, size, docsCount, err = b.addQueryResultsNoLock(cancellable, + results, batch) if err != nil { return false, err } @@ -586,7 +599,8 @@ func (b *block) queryWithSpanAndRLock( // Add last batch to results if remaining. if len(batch) > 0 { - batch, size, docsCount, err = b.addQueryResults(cancellable, results, batch) + batch, size, docsCount, err = b.addQueryResultsNoLock(cancellable, + results, batch) if err != nil { return false, err } @@ -602,14 +616,14 @@ func (b *block) queryWithSpanAndRLock( return opts.exhaustive(size, docsCount), nil } -func (b *block) closeAsync(closer io.Closer) { +func (b *block) closeAsyncNoLock(closer io.Closer) { if err := closer.Close(); err != nil { // Note: This only happens if closing the readers isn't clean. b.logger.Error("could not close query index block resource", zap.Error(err)) } } -func (b *block) addQueryResults( +func (b *block) addQueryResultsNoLock( cancellable *xresource.CancellableLifetime, results BaseResults, batch []doc.Document, @@ -661,7 +675,7 @@ func (b *block) Aggregate( sp.LogFields(logFields...) 
defer sp.Finish() - exhaustive, err := b.aggregateWithSpan(ctx, cancellable, opts, results, sp) + exhaustive, err := b.aggregateNoLock(ctx, cancellable, opts, results) if err != nil { sp.LogFields(opentracinglog.Error(err)) } @@ -669,18 +683,15 @@ func (b *block) Aggregate( return exhaustive, err } -func (b *block) aggregateWithSpan( +func (b *block) aggregateNoLock( ctx context.Context, cancellable *xresource.CancellableLifetime, opts QueryOptions, results AggregateResults, - sp opentracing.Span, ) (bool, error) { - b.RLock() - defer b.RUnlock() - - if b.state == blockStateClosed { - return false, ErrUnableToQueryBlockClosed + readers, err := b.segmentReadersNoLock() + if err != nil { + return false, err } aggOpts := results.AggregateResultsOptions() @@ -720,7 +731,7 @@ func (b *block) aggregateWithSpan( var ( size = results.Size() - docsCount = results.TotalDocsCount() + docsN = results.TotalDocsCount() batch = b.opts.AggregateResultsEntryArrayPool().Get() batchSize = cap(batch) iterClosed = false // tracking whether we need to free the iterator at the end. @@ -737,23 +748,18 @@ func (b *block) aggregateWithSpan( } }() - readers, err := b.segmentReadersWithRLock() - if err != nil { - return false, err - } - // Make sure to close readers at end of query since results can // include references to the underlying bytes from the index segment // read by the readers. for _, reader := range readers { reader := reader // Capture for inline function. ctx.RegisterFinalizer(xresource.FinalizerFn(func() { - b.closeAsync(reader) + b.closeAsyncNoLock(reader) })) } for _, reader := range readers { - if opts.LimitsExceeded(size, docsCount) { + if opts.LimitsExceeded(size, docsN) { break } @@ -763,17 +769,19 @@ func (b *block) aggregateWithSpan( } for iter.Next() { - if opts.LimitsExceeded(size, docsCount) { + if opts.LimitsExceeded(size, docsN) { break } field, term := iter.Current() - batch = b.appendFieldAndTermToBatch(batch, field, term, iterateTerms) + batch = b.appendFieldAndTermToBatchNoLock(batch, field, term, + iterateTerms) if len(batch) < batchSize { continue } - batch, size, docsCount, err = b.addAggregateResults(cancellable, results, batch) + batch, size, docsN, err = b.addAggregateResultsNoLock(cancellable, + results, batch) if err != nil { return false, err } @@ -792,16 +800,17 @@ func (b *block) aggregateWithSpan( // Add last batch to results if remaining. 
 	if len(batch) > 0 {
-		batch, size, docsCount, err = b.addAggregateResults(cancellable, results, batch)
+		batch, size, docsN, err = b.addAggregateResultsNoLock(cancellable,
+			results, batch)
 		if err != nil {
 			return false, err
 		}
 	}
 
-	return opts.exhaustive(size, docsCount), nil
+	return opts.exhaustive(size, docsN), nil
 }
 
-func (b *block) appendFieldAndTermToBatch(
+func (b *block) appendFieldAndTermToBatchNoLock(
 	batch []AggregateResultsEntry,
 	field, term []byte,
 	includeTerms bool,
@@ -871,7 +880,7 @@ func (b *block) pooledID(id []byte) ident.ID {
 	return b.opts.IdentifierPool().BinaryID(data)
 }
 
-func (b *block) addAggregateResults(
+func (b *block) addAggregateResultsNoLock(
 	cancellable *xresource.CancellableLifetime,
 	results AggregateResults,
 	batch []AggregateResultsEntry,
diff --git a/src/dbnode/storage/index/options.go b/src/dbnode/storage/index/options.go
index 4de80d3a8a..e27f8d729e 100644
--- a/src/dbnode/storage/index/options.go
+++ b/src/dbnode/storage/index/options.go
@@ -128,7 +128,8 @@ type opts struct {
 	readThroughSegmentOptions ReadThroughSegmentOptions
 	mmapReporter              mmap.Reporter
 	queryLimits               limits.QueryLimits
-	queryWorkerPool           xsync.WorkerPool
+	queryBlockWorkerPool        xsync.WorkerPool
+	queryBlockSegmentWorkerPool xsync.WorkerPool
 }
 
 var undefinedUUIDFn = func() ([]byte, error) { return nil, errIDGenerationDisabled }
@@ -164,9 +165,15 @@ func NewOptions() Options {
 
 	instrumentOpts := instrument.NewOptions()
 
-	// Default to using half of the available cores for querying.
-	queryWorkerPool := xsync.NewWorkerPool(int(math.Ceil(float64(runtime.NumCPU()) / 2)))
-	queryWorkerPool.Init()
+	// Default to using half of the available cores for querying segments,
+	// and 2x that so there is always a block pending to be queried, ready to go.
+	queryBlockSegmentsConcurrency := int(math.Ceil(float64(runtime.NumCPU()) / 2.0))
+
+	queryBlockSegmentWorkerPool := xsync.NewWorkerPool(queryBlockSegmentsConcurrency)
+	queryBlockSegmentWorkerPool.Init()
+
+	queryBlockWorkerPool := xsync.NewWorkerPool(2 * queryBlockSegmentsConcurrency)
+	queryBlockWorkerPool.Init()
 
 	opts := &opts{
 		insertMode:                      defaultIndexInsertMode,
@@ -185,7 +192,8 @@ func NewOptions() Options {
 		foregroundCompactionPlannerOpts: defaultForegroundCompactionOpts,
 		backgroundCompactionPlannerOpts: defaultBackgroundCompactionOpts,
 		queryLimits:                     limits.NoOpQueryLimits(),
-		queryWorkerPool:                 queryWorkerPool,
+		queryBlockWorkerPool:            queryBlockWorkerPool,
+		queryBlockSegmentWorkerPool:     queryBlockSegmentWorkerPool,
 	}
 	resultsPool.Init(func() QueryResults {
 		return NewQueryResults(nil, QueryResultsOptions{}, opts)
@@ -439,12 +447,22 @@ func (o *opts) QueryLimits() limits.QueryLimits {
 	return o.queryLimits
 }
 
-func (o *opts) SetQueryWorkerPool(value xsync.WorkerPool) Options {
+func (o *opts) SetQueryBlockWorkerPool(value xsync.WorkerPool) Options {
+	opts := *o
+	opts.queryBlockWorkerPool = value
+	return &opts
+}
+
+func (o *opts) QueryBlockWorkerPool() xsync.WorkerPool {
+	return o.queryBlockWorkerPool
+}
+
+func (o *opts) SetQueryBlockSegmentWorkerPool(value xsync.WorkerPool) Options {
 	opts := *o
-	opts.queryWorkerPool = value
+	opts.queryBlockSegmentWorkerPool = value
 	return &opts
 }
 
-func (o *opts) QueryWorkerPool() xsync.WorkerPool {
-	return o.queryWorkerPool
+func (o *opts) QueryBlockSegmentWorkerPool() xsync.WorkerPool {
+	return o.queryBlockSegmentWorkerPool
 }
diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go
index 662e0f09f3..104f752556 100644
--- a/src/dbnode/storage/index/types.go
+++ b/src/dbnode/storage/index/types.go
@@ -991,9 +991,15 @@ type Options interface {
 	// QueryLimits returns the current query limits.
 	QueryLimits() limits.QueryLimits
 
-	// SetQueryWorkerPool sets the QueryIDs worker pool.
-	SetQueryWorkerPool(value xsync.WorkerPool) Options
+	// SetQueryBlockWorkerPool sets the query block worker pool.
+	SetQueryBlockWorkerPool(value xsync.WorkerPool) Options
 
-	// QueryWorkerPool returns the QueryIDs worker pool.
-	QueryWorkerPool() xsync.WorkerPool
+	// QueryBlockWorkerPool returns the query block worker pool.
+	QueryBlockWorkerPool() xsync.WorkerPool
+
+	// SetQueryBlockSegmentWorkerPool sets the query block segment worker pool.
+	SetQueryBlockSegmentWorkerPool(value xsync.WorkerPool) Options
+
+	// QueryBlockSegmentWorkerPool returns the query block segment worker pool.
+	QueryBlockSegmentWorkerPool() xsync.WorkerPool
 }
diff --git a/src/x/sync/types.go b/src/x/sync/types.go
index 6f96fc4b3f..750ce94f85 100644
--- a/src/x/sync/types.go
+++ b/src/x/sync/types.go
@@ -82,6 +82,15 @@ type WorkerPool interface {
 	// available, returning true if a worker becomes available, or false
 	// otherwise.
 	GoWithTimeout(work Work, timeout time.Duration) bool
+
+	// GetToken blocks until it can reserve, for the current goroutine, one
+	// of the goroutine concurrency slots allocated to the pool. Useful for
+	// bounding concurrency when the caller is already running on its own
+	// goroutine and is about to perform work.
+	GetToken()
+
+	// PutToken returns a token reserved by GetToken.
+	PutToken()
 }
 
 // PooledWorkerPoolOptions is the options for a PooledWorkerPool.
diff --git a/src/x/sync/worker_pool.go b/src/x/sync/worker_pool.go
index 0e6ea6efca..11f80c4b76 100644
--- a/src/x/sync/worker_pool.go
+++ b/src/x/sync/worker_pool.go
@@ -61,6 +61,14 @@ func (p *workerPool) GoIfAvailable(work Work) bool {
 	}
 }
 
+func (p *workerPool) GetToken() {
+	<-p.workCh
+}
+
+func (p *workerPool) PutToken() {
+	p.workCh <- struct{}{}
+}
+
 func (p *workerPool) GoWithTimeout(work Work, timeout time.Duration) bool {
 	// Attempt to try writing without allocating a ticker.
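// ---- Editor's note (illustrative sketch, not part of the patch) --------
// How the two new pools fit together, mirroring NewOptions above: segment
// concurrency defaults to half the cores, and block concurrency to twice
// that, so one block can always be pending while another is being queried.
// This assumes the storage/index Options interface added in this patch and
// the math, runtime and xsync imports already used in options.go; the
// function name and sizes are illustrative only.
func newOptionsWithQueryPools(base Options) Options {
	// Half the available cores for querying segments within a block.
	segmentConcurrency := int(math.Ceil(float64(runtime.NumCPU()) / 2.0))
	segmentPool := xsync.NewWorkerPool(segmentConcurrency)
	segmentPool.Init()

	// 2x the segment concurrency for block-level fan-out.
	blockPool := xsync.NewWorkerPool(2 * segmentConcurrency)
	blockPool.Init()

	return base.
		SetQueryBlockWorkerPool(blockPool).
		SetQueryBlockSegmentWorkerPool(segmentPool)
}
// ---- End editor's note --------------------------------------------------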
select { From e113d5ba08accee7b21ea44e4467aee4734f3ec7 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 10 Nov 2020 19:00:16 -0500 Subject: [PATCH 027/106] Fix build --- src/dbnode/storage/index.go | 2 +- src/dbnode/storage/index/block.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 77ea407932..f00f1e84d7 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -355,7 +355,7 @@ func newNamespaceIndexWithOptions( resultsPool: indexOpts.QueryResultsPool(), aggregateResultsPool: indexOpts.AggregateResultsPool(), - queryWorkersPool: newIndexOpts.opts.IndexOptions().QueryWorkerPool(), + queryWorkersPool: newIndexOpts.opts.IndexOptions().QueryBlockWorkerPool(), metrics: newNamespaceIndexMetrics(indexOpts, instrumentOpts), doNotIndexWithFields: doNotIndexWithFields, diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 9054348845..b5a46ee6a4 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -257,7 +257,7 @@ func NewBlock( logger: iopts.Logger(), queryLimits: opts.QueryLimits(), docsLimit: opts.QueryLimits().DocsLimit(), - querySegmentsWorkers: opts.QueryBlockSegmentsWorkerPool(), + querySegmentsWorkers: opts.QueryBlockSegmentWorkerPool(), } b.newFieldsAndTermsIteratorFn = newFieldsAndTermsIterator From 9d61cb25ada9a06834110e3c1dacc289ff1e1022 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 11 Nov 2020 22:57:17 -0500 Subject: [PATCH 028/106] Wire up new series index backoff to runtime options --- src/dbnode/storage/index.go | 3 +- src/dbnode/storage/index_insert_queue.go | 38 +++++++++++++++++------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index f00f1e84d7..524aef438f 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -309,6 +309,7 @@ func newNamespaceIndexWithOptions( "namespace": nsMD.ID().String(), }) instrumentOpts = instrumentOpts.SetMetricsScope(scope) + storageOpts := newIndexOpts.opts.SetInstrumentOptions(instrumentOpts) indexOpts = indexOpts.SetInstrumentOptions(instrumentOpts) nowFn := indexOpts.ClockOptions().NowFn() @@ -392,7 +393,7 @@ func newNamespaceIndexWithOptions( idx.forwardIndexDice = dice // allocate indexing queue and start it up. 
- queue := newIndexQueueFn(idx.writeBatches, nsMD, nowFn, scope) + queue := newIndexQueueFn(idx.writeBatches, nsMD, storageOpts) if err := queue.Start(); err != nil { return nil, err } diff --git a/src/dbnode/storage/index_insert_queue.go b/src/dbnode/storage/index_insert_queue.go index b168b99879..47798db952 100644 --- a/src/dbnode/storage/index_insert_queue.go +++ b/src/dbnode/storage/index_insert_queue.go @@ -27,9 +27,11 @@ import ( "time" "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/storage/index" "github.com/m3db/m3/src/dbnode/ts/writes" "github.com/m3db/m3/src/x/clock" + xresource "github.com/m3db/m3/src/x/resource" xsync "github.com/m3db/m3/src/x/sync" "github.com/uber-go/tally" @@ -47,7 +49,6 @@ const ( nsIndexInsertQueueStateOpen nsIndexInsertQueueStateClosed - // TODO(prateek): runtime options for this stuff defaultIndexBatchBackoff = 2 * time.Millisecond indexResetAllInsertsEvery = 3 * time.Minute @@ -66,11 +67,12 @@ type nsIndexInsertQueue struct { // active batch pending execution currBatch *nsIndexInsertBatch - indexBatchFn nsIndexInsertBatchFn - nowFn clock.NowFn - sleepFn func(time.Duration) - notifyInsert chan struct{} - closeCh chan struct{} + indexBatchFn nsIndexInsertBatchFn + nowFn clock.NowFn + sleepFn func(time.Duration) + notifyInsert chan struct{} + closeCh chan struct{} + runtimeOptsListenCloser xresource.SimpleCloser scope tally.Scope @@ -78,7 +80,10 @@ type nsIndexInsertQueue struct { } type newNamespaceIndexInsertQueueFn func( - nsIndexInsertBatchFn, namespace.Metadata, clock.NowFn, tally.Scope) namespaceIndexInsertQueue + nsIndexInsertBatchFn, + namespace.Metadata, + Options, +) namespaceIndexInsertQueue // newNamespaceIndexInsertQueue returns a new index insert queue. // Note: No limit appears on the index insert queue since any items making @@ -92,15 +97,14 @@ type newNamespaceIndexInsertQueueFn func( func newNamespaceIndexInsertQueue( indexBatchFn nsIndexInsertBatchFn, namespaceMetadata namespace.Metadata, - nowFn clock.NowFn, - scope tally.Scope, + opts Options, ) namespaceIndexInsertQueue { - subscope := scope.SubScope("insert-queue") + subscope := opts.InstrumentOptions().MetricsScope().SubScope("insert-queue") q := &nsIndexInsertQueue{ namespaceMetadata: namespaceMetadata, indexBatchBackoff: defaultIndexBatchBackoff, indexBatchFn: indexBatchFn, - nowFn: nowFn, + nowFn: opts.ClockOptions().NowFn(), sleepFn: time.Sleep, // NB(r): Use 2 * num cores so that each CPU insert queue which // is 1 per num CPU core can always enqueue a notification without @@ -110,10 +114,21 @@ func newNamespaceIndexInsertQueue( scope: subscope, metrics: newNamespaceIndexInsertQueueMetrics(subscope), } + // Create new batch. q.currBatch = q.newBatch(newBatchOptions{instrumented: true}) + // Register runtime options manager (which will call SetRuntimeOptions + // immediately). 
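// ---- Editor's note (illustrative sketch, not part of the patch) --------
// The registration below relies on the runtime options manager invoking
// SetRuntimeOptions immediately and then again on every update, which is
// how the queue's backoff tracks WriteNewSeriesBackoffDuration at runtime.
// A minimal, self-contained model of that listener contract; these types
// are simplified stand-ins, not the real runtime.Options interfaces.
package main

import (
	"fmt"
	"sync"
	"time"
)

type runtimeOptions struct{ writeNewSeriesBackoff time.Duration }

type listener interface{ SetRuntimeOptions(opts runtimeOptions) }

type optionsManager struct {
	mu        sync.Mutex
	opts      runtimeOptions
	listeners []listener
}

// RegisterListener calls the listener immediately with the current options,
// then again on each subsequent Update.
func (m *optionsManager) RegisterListener(l listener) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.listeners = append(m.listeners, l)
	l.SetRuntimeOptions(m.opts)
}

func (m *optionsManager) Update(opts runtimeOptions) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.opts = opts
	for _, l := range m.listeners {
		l.SetRuntimeOptions(opts)
	}
}

type insertQueue struct {
	mu      sync.Mutex
	backoff time.Duration
}

func (q *insertQueue) SetRuntimeOptions(opts runtimeOptions) {
	q.mu.Lock()
	q.backoff = opts.writeNewSeriesBackoff
	q.mu.Unlock()
}

func main() {
	mgr := &optionsManager{opts: runtimeOptions{2 * time.Millisecond}}
	q := &insertQueue{}
	mgr.RegisterListener(q) // backoff initialized immediately
	mgr.Update(runtimeOptions{5 * time.Millisecond})
	fmt.Println(q.backoff) // 5ms
}
// ---- End editor's note --------------------------------------------------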
+ runtimeOptsMgr := opts.RuntimeOptionsManager() + q.runtimeOptsListenCloser = runtimeOptsMgr.RegisterListener(q) return q } +func (q *nsIndexInsertQueue) SetRuntimeOptions(value runtime.Options) { + q.Lock() + q.indexBatchBackoff = value.WriteNewSeriesBackoffDuration() + q.Unlock() +} + type newBatchOptions struct { instrumented bool } @@ -254,6 +269,7 @@ func (q *nsIndexInsertQueue) Stop() error { } q.state = nsIndexInsertQueueStateClosed + q.runtimeOptsListenCloser.Close() q.Unlock() // Final flush From dd38ead49a6f51747ae75231dfe5d400c5948883 Mon Sep 17 00:00:00 2001 From: arnikola Date: Wed, 11 Nov 2020 08:23:53 -0500 Subject: [PATCH 029/106] [dbnode] Refactor wide query path (#2826) --- .golangci.yml | 14 +- Makefile | 7 +- src/dbnode/encoding/types.go | 8 +- src/dbnode/generated/mocks/generate.go | 3 +- src/dbnode/integration/wide_query_test.go | 114 +--- src/dbnode/persist/fs/fs_mock.go | 405 +++++++------- src/dbnode/persist/fs/msgpack/decoder.go | 37 +- src/dbnode/persist/fs/msgpack/decoder_test.go | 26 +- .../persist/fs/msgpack/roundtrip_test.go | 85 +-- src/dbnode/persist/fs/retriever.go | 493 ++++++++++-------- src/dbnode/persist/fs/retriever_test.go | 4 - src/dbnode/persist/fs/seek.go | 135 +---- src/dbnode/persist/fs/types.go | 114 ++-- .../wide/entry_checksum_mismatch_checker.go | 280 ---------- ...try_checksum_mismatch_checker_prop_test.go | 463 ---------------- .../entry_checksum_mismatch_checker_test.go | 375 ------------- .../wide/index_checksum_block_batch_reader.go | 79 --- .../index_checksum_block_batch_reader_test.go | 49 -- src/dbnode/persist/fs/wide/options.go | 94 ---- src/dbnode/persist/fs/wide/options_test.go | 45 -- src/dbnode/persist/fs/wide/types.go | 117 ----- src/dbnode/persist/fs/wide/wide_mock.go | 147 ------ src/dbnode/persist/schema/types.go | 6 +- src/dbnode/storage/block/block_mock.go | 107 ++-- src/dbnode/storage/block/retriever_manager.go | 18 +- src/dbnode/storage/block/types.go | 63 +-- src/dbnode/storage/database.go | 123 +---- src/dbnode/storage/database_test.go | 68 +-- src/dbnode/storage/index/query_options.go | 13 +- .../storage/index/query_options_test.go | 8 +- src/dbnode/storage/index/types.go | 2 +- .../storage/index/wide_query_results_test.go | 14 +- .../storage/index_queue_forward_write_test.go | 9 +- src/dbnode/storage/index_queue_test.go | 14 +- src/dbnode/storage/namespace.go | 27 +- src/dbnode/storage/options.go | 11 + src/dbnode/storage/series/reader.go | 388 +++++++------- src/dbnode/storage/series/reader_test.go | 45 +- src/dbnode/storage/series/series.go | 27 +- src/dbnode/storage/series/series_mock.go | 59 +-- src/dbnode/storage/series/types.go | 16 +- src/dbnode/storage/shard.go | 39 +- src/dbnode/storage/shard_test.go | 96 +--- src/dbnode/storage/storage_mock.go | 97 +--- src/dbnode/storage/types.go | 47 +- src/dbnode/tracepoint/tracepoint.go | 4 +- .../xio/{index_checksum.go => wide_entry.go} | 33 +- src/x/checked/debug.go | 5 + 48 files changed, 1144 insertions(+), 3289 deletions(-) delete mode 100644 src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker.go delete mode 100644 src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_prop_test.go delete mode 100644 src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_test.go delete mode 100644 src/dbnode/persist/fs/wide/index_checksum_block_batch_reader.go delete mode 100644 src/dbnode/persist/fs/wide/index_checksum_block_batch_reader_test.go delete mode 100644 src/dbnode/persist/fs/wide/options.go delete mode 100644 src/dbnode/persist/fs/wide/options_test.go 
 delete mode 100644 src/dbnode/persist/fs/wide/types.go
 delete mode 100644 src/dbnode/persist/fs/wide/wide_mock.go
 rename src/dbnode/x/xio/{index_checksum.go => wide_entry.go} (72%)

diff --git a/.golangci.yml b/.golangci.yml
index 8ecd89e592..5c83931b61 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -108,6 +108,8 @@ linters-settings:
   locale: US
   ignore-words:
     - someword
+  exhaustive:
+    default-signifies-exhaustive: true
   lll:
     # max line length, lines longer will be reported. Default is 120.
     # '\t' is counted as 1 character by default, and can be changed with the tab-width option
@@ -172,7 +174,6 @@ linters:
   - goconst
   - gocritic
   - gocyclo
-  - godox
   - goimports
   - golint
   - gosimple
@@ -211,10 +212,17 @@ linters:
   - exhaustivestruct
   # We allow cuddling assignment following conditions because there are valid
   # logical groupings for this use-case (e.g. when evaluating config values).
-  - wsl
+  - wsl
+  # Wrapcheck can cause errors until all callsites checking explicit error
+  # types like io.EOF are converted to use errors.Is instead. Re-enable this
+  # linter once all error checks are upgraded.
+  - wrapcheck
+  # godox prevents using TODOs or FIXMEs which can be useful for demarcation
+  # of future work.
+  - godox
   # New line required before return would require a large fraction of the
   # code base to need updating, it's not worth the perceived benefit.
-  - nlreturn
+  - nlreturn
 disable-all: false
 presets:
   # bodyclose, errcheck, gosec, govet, scopelint, staticcheck, typecheck
diff --git a/Makefile b/Makefile
index 92943d6670..d54b60a32f 100644
--- a/Makefile
+++ b/Makefile
@@ -165,7 +165,7 @@ tools-linux-amd64:
 	$(LINUX_AMD64_ENV) make tools
 
 .PHONY: all
-all: lint test-ci-unit test-ci-integration services tools
+all: test-ci-unit test-ci-integration services tools
 	@echo Made all successfully
 
 .PHONY: install-tools
@@ -256,7 +256,7 @@ SUBDIR_TARGETS := \
 	asset-gen    \
 	genny-gen    \
 	license-gen  \
-	all-gen      \
+	all-gen        \
 	lint
 
 .PHONY: test-ci-unit
@@ -384,6 +384,7 @@ endef
 
 # generate targets across SUBDIRS for each SUBDIR_TARGET. i.e. generate rules
 # which allow `make all-gen` to invoke `make all-gen-dbnode all-gen-coordinator ...`
+# NB: we skip lint explicitly as it runs as a separate CI step.
$(foreach SUBDIR_TARGET, $(SUBDIR_TARGETS), $(eval $(SUBDIR_TARGET_RULE)))
 
 # Builds the single kube bundle from individual manifest files.
@@ -401,7 +402,7 @@ go-mod-tidy:
 .PHONY: all-gen
 all-gen: \
 	install-tools \
-	$(foreach SUBDIR_TARGET, $(SUBDIR_TARGETS), $(SUBDIR_TARGET)) \
+	$(foreach SUBDIR_TARGET, $(filter-out lint all-gen,$(SUBDIR_TARGETS)), $(SUBDIR_TARGET)) \
 	kube-gen-all \
 	go-mod-tidy
 
diff --git a/src/dbnode/encoding/types.go b/src/dbnode/encoding/types.go
index 93af1b4746..230925b340 100644
--- a/src/dbnode/encoding/types.go
+++ b/src/dbnode/encoding/types.go
@@ -88,7 +88,7 @@ type Encoder interface {
 	DiscardReset(t time.Time, capacity int, schema namespace.SchemaDescr) ts.Segment
 }
 
-// NewEncoderFn creates a new encoder
+// NewEncoderFn creates a new encoder.
 type NewEncoderFn func(start time.Time, bytes []byte) Encoder
 
 // Options represents different options for encoding time as well as markers.
type OStream interface { - // Len returns the length of the OStream + // Len returns the length of the OStream. Len() int - // Empty returns whether the OStream is empty + // Empty returns whether the OStream is empty. Empty() bool // WriteBit writes the last bit of v. diff --git a/src/dbnode/generated/mocks/generate.go b/src/dbnode/generated/mocks/generate.go index 8da07ecdc5..3869d17577 100644 --- a/src/dbnode/generated/mocks/generate.go +++ b/src/dbnode/generated/mocks/generate.go @@ -20,8 +20,7 @@ // mockgen rules for generating mocks for exported interfaces (reflection mode) -//go:generate sh -c "mockgen -package=wide $PACKAGE/src/dbnode/persist/fs/wide EntryChecksumMismatchChecker,StreamedMismatch | genclean -pkg $PACKAGE/src/dbnode/persist/fs/wide -out $GOPATH/src/$PACKAGE/src/dbnode/persist/fs/wide/wide_mock.go" -//go:generate sh -c "mockgen -package=fs $PACKAGE/src/dbnode/persist/fs DataFileSetWriter,DataFileSetReader,DataFileSetSeeker,IndexFileSetWriter,IndexFileSetReader,IndexSegmentFileSetWriter,IndexSegmentFileSet,IndexSegmentFile,SnapshotMetadataFileWriter,DataFileSetSeekerManager,ConcurrentDataFileSetSeeker,MergeWith,CrossBlockReader,CrossBlockIterator,StreamingWriter | genclean -pkg $PACKAGE/src/dbnode/persist/fs -out $GOPATH/src/$PACKAGE/src/dbnode/persist/fs/fs_mock.go" +//go:generate sh -c "mockgen -package=fs $PACKAGE/src/dbnode/persist/fs CrossBlockReader,CrossBlockIterator,DataFileSetWriter,DataFileSetReader,DataFileSetSeeker,IndexFileSetWriter,IndexFileSetReader,IndexSegmentFileSetWriter,IndexSegmentFileSet,IndexSegmentFile,SnapshotMetadataFileWriter,DataFileSetSeekerManager,ConcurrentDataFileSetSeeker,MergeWith,StreamingWriter | genclean -pkg $PACKAGE/src/dbnode/persist/fs -out $GOPATH/src/$PACKAGE/src/dbnode/persist/fs/fs_mock.go" //go:generate sh -c "mockgen -package=xio $PACKAGE/src/dbnode/x/xio SegmentReader,SegmentReaderPool | genclean -pkg $PACKAGE/src/dbnode/x/xio -out $GOPATH/src/$PACKAGE/src/dbnode/x/xio/io_mock.go" //go:generate sh -c "mockgen -package=digest -destination=$GOPATH/src/$PACKAGE/src/dbnode/digest/digest_mock.go $PACKAGE/src/dbnode/digest ReaderWithDigest" //go:generate sh -c "mockgen -package=series $PACKAGE/src/dbnode/storage/series DatabaseSeries,QueryableBlockRetriever | genclean -pkg $PACKAGE/src/dbnode/storage/series -out $GOPATH/src/$PACKAGE/src/dbnode/storage/series/series_mock.go" diff --git a/src/dbnode/integration/wide_query_test.go b/src/dbnode/integration/wide_query_test.go index ff95fb918a..17192d9841 100644 --- a/src/dbnode/integration/wide_query_test.go +++ b/src/dbnode/integration/wide_query_test.go @@ -1,4 +1,4 @@ -// +build integration +// +build big // // Copyright (c) 2020 Uber Technologies, Inc. 
// @@ -23,7 +23,6 @@ package integration import ( - "bytes" "fmt" "io/ioutil" "runtime" @@ -32,10 +31,8 @@ import ( "testing" "time" - "github.com/m3db/m3/src/dbnode/encoding/m3tsz" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist/fs" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/persist/schema" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage" @@ -60,9 +57,9 @@ const ( wideTagValFmt = "val-%05d" ) -type shardedIndexChecksum struct { - shard uint32 - checksums []schema.IndexChecksum +type shardedWideEntry struct { + shard uint32 + entries []schema.WideEntry } // buildExpectedChecksumsByShard sorts the given IDs into ascending shard order, @@ -72,10 +69,10 @@ func buildExpectedChecksumsByShard( allowedShards []uint32, shardSet sharding.ShardSet, batchSize int, -) []schema.IndexChecksum { - shardedChecksums := make([]shardedIndexChecksum, 0, len(ids)) +) []schema.WideEntry { + shardedEntries := make([]shardedWideEntry, 0, len(ids)) for i, id := range ids { - checksum := schema.IndexChecksum{ + entry := schema.WideEntry{ IndexEntry: schema.IndexEntry{ ID: []byte(id), }, @@ -98,13 +95,13 @@ func buildExpectedChecksumsByShard( } found := false - for idx, sharded := range shardedChecksums { + for idx, sharded := range shardedEntries { if shard != sharded.shard { continue } found = true - shardedChecksums[idx].checksums = append(sharded.checksums, checksum) + shardedEntries[idx].entries = append(sharded.entries, entry) break } @@ -112,36 +109,22 @@ func buildExpectedChecksumsByShard( continue } - shardedChecksums = append(shardedChecksums, shardedIndexChecksum{ - shard: shard, - checksums: []schema.IndexChecksum{checksum}, + shardedEntries = append(shardedEntries, shardedWideEntry{ + shard: shard, + entries: []schema.WideEntry{entry}, }) } - sort.Slice(shardedChecksums, func(i, j int) bool { - return shardedChecksums[i].shard < shardedChecksums[j].shard + sort.Slice(shardedEntries, func(i, j int) bool { + return shardedEntries[i].shard < shardedEntries[j].shard }) - var checksums []schema.IndexChecksum - for _, sharded := range shardedChecksums { - checksums = append(checksums, sharded.checksums...) + var entries []schema.WideEntry + for _, sharded := range shardedEntries { + entries = append(entries, sharded.entries...) } - // NB: IDs should only be included for documents that conclude a batch. - l := len(checksums) - if l == 0 { - return checksums - } - - // NB: only look at the last `l-1` elements, as the last element should - // always have its ID. 
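// ---- Editor's note (illustrative sketch, not part of the patch) --------
// buildExpectedChecksumsByShard above buckets entries by their owning
// shard, orders the buckets by ascending shard, then flattens them. The
// same grouping expressed with a map instead of the test's slice scan;
// hashShard is a hypothetical stand-in for shardSet.Lookup on a hashed ID.
package main

import (
	"fmt"
	"sort"
)

type entry struct{ id string }

func hashShard(id string, numShards uint32) uint32 {
	var h uint32
	for i := 0; i < len(id); i++ {
		h = h*31 + uint32(id[i]) // toy hash, illustrative only
	}
	return h % numShards
}

func groupByShard(ids []string, numShards uint32) []entry {
	buckets := make(map[uint32][]entry)
	for _, id := range ids {
		shard := hashShard(id, numShards)
		buckets[shard] = append(buckets[shard], entry{id: id})
	}
	// Order buckets by ascending shard, as the helper does.
	shards := make([]uint32, 0, len(buckets))
	for shard := range buckets {
		shards = append(shards, shard)
	}
	sort.Slice(shards, func(i, j int) bool { return shards[i] < shards[j] })

	var flattened []entry
	for _, shard := range shards {
		flattened = append(flattened, buckets[shard]...)
	}
	return flattened
}

func main() {
	fmt.Println(groupByShard([]string{"foo", "bar", "baz", "qux"}, 4))
}
// ---- End editor's note --------------------------------------------------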
- for i, checksum := range checksums[:l-1] { - if (i+1)%batchSize != 0 { - checksums[i].ID = checksum.ID[:0] - } - } - - return checksums + return entries } func assertTags( @@ -164,28 +147,6 @@ func assertTags( require.NoError(t, decoder.Err()) } -func assertData( - t *testing.T, - ex int64, - exTime time.Time, - mismatch wide.ReadMismatch, -) { - mismatch.Data.IncRef() - mismatchData := mismatch.Data.Bytes() - mismatch.Data.DecRef() - - decoder := m3tsz.NewDecoder(true, nil) - dataReader := bytes.NewBuffer(mismatchData) - it := decoder.Decode(dataReader) - assert.NoError(t, it.Err()) - assert.True(t, it.Next()) - ts, _, _ := it.Current() - assert.True(t, ts.Timestamp.Equal(exTime)) - assert.Equal(t, float64(ex), ts.Value) - assert.False(t, it.Next()) - assert.NoError(t, it.Err()) -} - func TestWideFetch(t *testing.T) { if testing.Short() { t.SkipNow() // Just skip if we're doing a short run @@ -315,10 +276,6 @@ func TestWideFetch(t *testing.T) { decoder := tagDecoderPool.Get() defer decoder.Close() - wideOpts := wide.NewOptions(). - SetDecodingOptions(decOpts). - SetBatchSize(batchSize) - for _, tt := range shardFilterTests { t.Run(tt.name, func(t *testing.T) { ctx := context.NewContext() @@ -333,32 +290,6 @@ func TestWideFetch(t *testing.T) { assert.Equal(t, expected[i].MetadataChecksum, checksum.MetadataChecksum) require.Equal(t, string(expected[i].ID), checksum.ID.String()) assertTags(t, checksum.EncodedTags, decoder, checksum.MetadataChecksum) - checksum.Finalize() - } - - ctx.Close() - }) - - t.Run(fmt.Sprintf("%s_checksum_mismatch", tt.name), func(t *testing.T) { - ctx := context.NewContext() - // NB: empty index checksum blocks. - inCh := make(chan wide.IndexChecksumBlockBatch) - batchReader := wide.NewIndexChecksumBlockBatchReader(inCh) - close(inCh) - - checker := wide.NewEntryChecksumMismatchChecker(batchReader, wideOpts) - mismatches, err := testSetup.DB().ReadMismatches(ctx, nsMetadata.ID(), query, - checker, now, tt.shards, iterOpts) - require.NoError(t, err) - - expected := buildExpectedChecksumsByShard(ids, tt.shards, - testSetup.ShardSet(), batchSize) - require.Equal(t, len(expected), len(mismatches)) - for i, mismatch := range mismatches { - assert.Equal(t, expected[i].MetadataChecksum, mismatch.MetadataChecksum) - assertTags(t, mismatch.EncodedTags, decoder, mismatch.MetadataChecksum) - assertData(t, expected[i].MetadataChecksum, now, mismatch) - mismatch.Finalize() } ctx.Close() @@ -394,15 +325,14 @@ func TestWideFetch(t *testing.T) { now, tt.shards, iterOpts) require.NoError(t, err) - if !tt.expected { - assert.Equal(t, 0, len(chk)) - } else { + if tt.expected { require.Equal(t, 1, len(chk)) checksum := chk[0] assert.Equal(t, int64(1), checksum.MetadataChecksum) assert.Equal(t, exactID, checksum.ID.String()) assertTags(t, checksum.EncodedTags, decoder, checksum.MetadataChecksum) - checksum.Finalize() + } else { + assert.Equal(t, 0, len(chk)) } ctx.Close() @@ -443,8 +373,6 @@ func TestWideFetch(t *testing.T) { expected[i].MetadataChecksum, checksum.MetadataChecksum) break } - - checksum.Finalize() } ctx.Close() diff --git a/src/dbnode/persist/fs/fs_mock.go b/src/dbnode/persist/fs/fs_mock.go index 11da9e4812..861c2659a1 100644 --- a/src/dbnode/persist/fs/fs_mock.go +++ b/src/dbnode/persist/fs/fs_mock.go @@ -1,5 +1,5 @@ // Code generated by MockGen. DO NOT EDIT. 
-// Source: github.com/m3db/m3/src/dbnode/persist/fs (interfaces: DataFileSetWriter,DataFileSetReader,DataFileSetSeeker,IndexFileSetWriter,IndexFileSetReader,IndexSegmentFileSetWriter,IndexSegmentFileSet,IndexSegmentFile,SnapshotMetadataFileWriter,DataFileSetSeekerManager,ConcurrentDataFileSetSeeker,MergeWith,CrossBlockReader,CrossBlockIterator,StreamingWriter) +// Source: github.com/m3db/m3/src/dbnode/persist/fs (interfaces: CrossBlockReader,CrossBlockIterator,DataFileSetWriter,DataFileSetReader,DataFileSetSeeker,IndexFileSetWriter,IndexFileSetReader,IndexSegmentFileSetWriter,IndexSegmentFileSet,IndexSegmentFile,SnapshotMetadataFileWriter,DataFileSetSeekerManager,ConcurrentDataFileSetSeeker,MergeWith,StreamingWriter) // Copyright (c) 2020 Uber Technologies, Inc. // @@ -31,7 +31,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" persist "github.com/m3db/m3/src/dbnode/persist" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/dbnode/x/xio" @@ -44,6 +43,178 @@ import ( "github.com/golang/mock/gomock" ) +// MockCrossBlockReader is a mock of CrossBlockReader interface +type MockCrossBlockReader struct { + ctrl *gomock.Controller + recorder *MockCrossBlockReaderMockRecorder +} + +// MockCrossBlockReaderMockRecorder is the mock recorder for MockCrossBlockReader +type MockCrossBlockReaderMockRecorder struct { + mock *MockCrossBlockReader +} + +// NewMockCrossBlockReader creates a new mock instance +func NewMockCrossBlockReader(ctrl *gomock.Controller) *MockCrossBlockReader { + mock := &MockCrossBlockReader{ctrl: ctrl} + mock.recorder = &MockCrossBlockReaderMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockCrossBlockReader) EXPECT() *MockCrossBlockReaderMockRecorder { + return m.recorder +} + +// Close mocks base method +func (m *MockCrossBlockReader) Close() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Close") + ret0, _ := ret[0].(error) + return ret0 +} + +// Close indicates an expected call of Close +func (mr *MockCrossBlockReaderMockRecorder) Close() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockCrossBlockReader)(nil).Close)) +} + +// Current mocks base method +func (m *MockCrossBlockReader) Current() (ident.BytesID, ts.EncodedTags, []BlockRecord) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Current") + ret0, _ := ret[0].(ident.BytesID) + ret1, _ := ret[1].(ts.EncodedTags) + ret2, _ := ret[2].([]BlockRecord) + return ret0, ret1, ret2 +} + +// Current indicates an expected call of Current +func (mr *MockCrossBlockReaderMockRecorder) Current() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Current", reflect.TypeOf((*MockCrossBlockReader)(nil).Current)) +} + +// Err mocks base method +func (m *MockCrossBlockReader) Err() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Err") + ret0, _ := ret[0].(error) + return ret0 +} + +// Err indicates an expected call of Err +func (mr *MockCrossBlockReaderMockRecorder) Err() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Err", reflect.TypeOf((*MockCrossBlockReader)(nil).Err)) +} + +// Next mocks base method +func (m *MockCrossBlockReader) Next() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Next") + ret0, _ := ret[0].(bool) + return ret0 +} + +// Next indicates an expected call 
of Next +func (mr *MockCrossBlockReaderMockRecorder) Next() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Next", reflect.TypeOf((*MockCrossBlockReader)(nil).Next)) +} + +// MockCrossBlockIterator is a mock of CrossBlockIterator interface +type MockCrossBlockIterator struct { + ctrl *gomock.Controller + recorder *MockCrossBlockIteratorMockRecorder +} + +// MockCrossBlockIteratorMockRecorder is the mock recorder for MockCrossBlockIterator +type MockCrossBlockIteratorMockRecorder struct { + mock *MockCrossBlockIterator +} + +// NewMockCrossBlockIterator creates a new mock instance +func NewMockCrossBlockIterator(ctrl *gomock.Controller) *MockCrossBlockIterator { + mock := &MockCrossBlockIterator{ctrl: ctrl} + mock.recorder = &MockCrossBlockIteratorMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockCrossBlockIterator) EXPECT() *MockCrossBlockIteratorMockRecorder { + return m.recorder +} + +// Close mocks base method +func (m *MockCrossBlockIterator) Close() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Close") +} + +// Close indicates an expected call of Close +func (mr *MockCrossBlockIteratorMockRecorder) Close() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockCrossBlockIterator)(nil).Close)) +} + +// Current mocks base method +func (m *MockCrossBlockIterator) Current() (ts.Datapoint, time0.Unit, ts.Annotation) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Current") + ret0, _ := ret[0].(ts.Datapoint) + ret1, _ := ret[1].(time0.Unit) + ret2, _ := ret[2].(ts.Annotation) + return ret0, ret1, ret2 +} + +// Current indicates an expected call of Current +func (mr *MockCrossBlockIteratorMockRecorder) Current() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Current", reflect.TypeOf((*MockCrossBlockIterator)(nil).Current)) +} + +// Err mocks base method +func (m *MockCrossBlockIterator) Err() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Err") + ret0, _ := ret[0].(error) + return ret0 +} + +// Err indicates an expected call of Err +func (mr *MockCrossBlockIteratorMockRecorder) Err() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Err", reflect.TypeOf((*MockCrossBlockIterator)(nil).Err)) +} + +// Next mocks base method +func (m *MockCrossBlockIterator) Next() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Next") + ret0, _ := ret[0].(bool) + return ret0 +} + +// Next indicates an expected call of Next +func (mr *MockCrossBlockIteratorMockRecorder) Next() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Next", reflect.TypeOf((*MockCrossBlockIterator)(nil).Next)) +} + +// Reset mocks base method +func (m *MockCrossBlockIterator) Reset(arg0 []BlockRecord) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Reset", arg0) +} + +// Reset indicates an expected call of Reset +func (mr *MockCrossBlockIteratorMockRecorder) Reset(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reset", reflect.TypeOf((*MockCrossBlockIterator)(nil).Reset), arg0) +} + // MockDataFileSetWriter is a mock of DataFileSetWriter interface type MockDataFileSetWriter struct { ctrl *gomock.Controller @@ -523,34 +694,19 @@ func (mr *MockDataFileSetSeekerMockRecorder) SeekIndexEntry(arg0, arg1 interface return 
mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekIndexEntry", reflect.TypeOf((*MockDataFileSetSeeker)(nil).SeekIndexEntry), arg0, arg1) } -// SeekIndexEntryToIndexChecksum mocks base method -func (m *MockDataFileSetSeeker) SeekIndexEntryToIndexChecksum(arg0 ident.ID, arg1 ReusableSeekerResources) (xio.IndexChecksum, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SeekIndexEntryToIndexChecksum", arg0, arg1) - ret0, _ := ret[0].(xio.IndexChecksum) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// SeekIndexEntryToIndexChecksum indicates an expected call of SeekIndexEntryToIndexChecksum -func (mr *MockDataFileSetSeekerMockRecorder) SeekIndexEntryToIndexChecksum(arg0, arg1 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekIndexEntryToIndexChecksum", reflect.TypeOf((*MockDataFileSetSeeker)(nil).SeekIndexEntryToIndexChecksum), arg0, arg1) -} - -// SeekReadMismatchesByIndexChecksum mocks base method -func (m *MockDataFileSetSeeker) SeekReadMismatchesByIndexChecksum(arg0 xio.IndexChecksum, arg1 wide.EntryChecksumMismatchChecker, arg2 ReusableSeekerResources) (wide.ReadMismatch, error) { +// SeekWideEntry mocks base method +func (m *MockDataFileSetSeeker) SeekWideEntry(arg0 ident.ID, arg1 ReusableSeekerResources) (xio.WideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SeekReadMismatchesByIndexChecksum", arg0, arg1, arg2) - ret0, _ := ret[0].(wide.ReadMismatch) + ret := m.ctrl.Call(m, "SeekWideEntry", arg0, arg1) + ret0, _ := ret[0].(xio.WideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// SeekReadMismatchesByIndexChecksum indicates an expected call of SeekReadMismatchesByIndexChecksum -func (mr *MockDataFileSetSeekerMockRecorder) SeekReadMismatchesByIndexChecksum(arg0, arg1, arg2 interface{}) *gomock.Call { +// SeekWideEntry indicates an expected call of SeekWideEntry +func (mr *MockDataFileSetSeekerMockRecorder) SeekWideEntry(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekReadMismatchesByIndexChecksum", reflect.TypeOf((*MockDataFileSetSeeker)(nil).SeekReadMismatchesByIndexChecksum), arg0, arg1, arg2) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekWideEntry", reflect.TypeOf((*MockDataFileSetSeeker)(nil).SeekWideEntry), arg0, arg1) } // MockIndexFileSetWriter is a mock of IndexFileSetWriter interface @@ -1274,34 +1430,19 @@ func (mr *MockConcurrentDataFileSetSeekerMockRecorder) SeekIndexEntry(arg0, arg1 return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekIndexEntry", reflect.TypeOf((*MockConcurrentDataFileSetSeeker)(nil).SeekIndexEntry), arg0, arg1) } -// SeekIndexEntryToIndexChecksum mocks base method -func (m *MockConcurrentDataFileSetSeeker) SeekIndexEntryToIndexChecksum(arg0 ident.ID, arg1 ReusableSeekerResources) (xio.IndexChecksum, error) { +// SeekWideEntry mocks base method +func (m *MockConcurrentDataFileSetSeeker) SeekWideEntry(arg0 ident.ID, arg1 ReusableSeekerResources) (xio.WideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SeekIndexEntryToIndexChecksum", arg0, arg1) - ret0, _ := ret[0].(xio.IndexChecksum) + ret := m.ctrl.Call(m, "SeekWideEntry", arg0, arg1) + ret0, _ := ret[0].(xio.WideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// SeekIndexEntryToIndexChecksum indicates an expected call of SeekIndexEntryToIndexChecksum -func (mr *MockConcurrentDataFileSetSeekerMockRecorder) SeekIndexEntryToIndexChecksum(arg0, arg1 interface{}) *gomock.Call { +// SeekWideEntry indicates 
an expected call of SeekWideEntry +func (mr *MockConcurrentDataFileSetSeekerMockRecorder) SeekWideEntry(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekIndexEntryToIndexChecksum", reflect.TypeOf((*MockConcurrentDataFileSetSeeker)(nil).SeekIndexEntryToIndexChecksum), arg0, arg1) -} - -// SeekReadMismatchesByIndexChecksum mocks base method -func (m *MockConcurrentDataFileSetSeeker) SeekReadMismatchesByIndexChecksum(arg0 xio.IndexChecksum, arg1 wide.EntryChecksumMismatchChecker, arg2 ReusableSeekerResources) (wide.ReadMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SeekReadMismatchesByIndexChecksum", arg0, arg1, arg2) - ret0, _ := ret[0].(wide.ReadMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// SeekReadMismatchesByIndexChecksum indicates an expected call of SeekReadMismatchesByIndexChecksum -func (mr *MockConcurrentDataFileSetSeekerMockRecorder) SeekReadMismatchesByIndexChecksum(arg0, arg1, arg2 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekReadMismatchesByIndexChecksum", reflect.TypeOf((*MockConcurrentDataFileSetSeeker)(nil).SeekReadMismatchesByIndexChecksum), arg0, arg1, arg2) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SeekWideEntry", reflect.TypeOf((*MockConcurrentDataFileSetSeeker)(nil).SeekWideEntry), arg0, arg1) } // MockMergeWith is a mock of MergeWith interface @@ -1357,178 +1498,6 @@ func (mr *MockMergeWithMockRecorder) Read(arg0, arg1, arg2, arg3 interface{}) *g return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Read", reflect.TypeOf((*MockMergeWith)(nil).Read), arg0, arg1, arg2, arg3) } -// MockCrossBlockReader is a mock of CrossBlockReader interface -type MockCrossBlockReader struct { - ctrl *gomock.Controller - recorder *MockCrossBlockReaderMockRecorder -} - -// MockCrossBlockReaderMockRecorder is the mock recorder for MockCrossBlockReader -type MockCrossBlockReaderMockRecorder struct { - mock *MockCrossBlockReader -} - -// NewMockCrossBlockReader creates a new mock instance -func NewMockCrossBlockReader(ctrl *gomock.Controller) *MockCrossBlockReader { - mock := &MockCrossBlockReader{ctrl: ctrl} - mock.recorder = &MockCrossBlockReaderMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use -func (m *MockCrossBlockReader) EXPECT() *MockCrossBlockReaderMockRecorder { - return m.recorder -} - -// Close mocks base method -func (m *MockCrossBlockReader) Close() error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Close") - ret0, _ := ret[0].(error) - return ret0 -} - -// Close indicates an expected call of Close -func (mr *MockCrossBlockReaderMockRecorder) Close() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockCrossBlockReader)(nil).Close)) -} - -// Current mocks base method -func (m *MockCrossBlockReader) Current() (ident.BytesID, ts.EncodedTags, []BlockRecord) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Current") - ret0, _ := ret[0].(ident.BytesID) - ret1, _ := ret[1].(ts.EncodedTags) - ret2, _ := ret[2].([]BlockRecord) - return ret0, ret1, ret2 -} - -// Current indicates an expected call of Current -func (mr *MockCrossBlockReaderMockRecorder) Current() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Current", reflect.TypeOf((*MockCrossBlockReader)(nil).Current)) -} - -// Err mocks base method -func 
(m *MockCrossBlockReader) Err() error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Err") - ret0, _ := ret[0].(error) - return ret0 -} - -// Err indicates an expected call of Err -func (mr *MockCrossBlockReaderMockRecorder) Err() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Err", reflect.TypeOf((*MockCrossBlockReader)(nil).Err)) -} - -// Next mocks base method -func (m *MockCrossBlockReader) Next() bool { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Next") - ret0, _ := ret[0].(bool) - return ret0 -} - -// Next indicates an expected call of Next -func (mr *MockCrossBlockReaderMockRecorder) Next() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Next", reflect.TypeOf((*MockCrossBlockReader)(nil).Next)) -} - -// MockCrossBlockIterator is a mock of CrossBlockIterator interface -type MockCrossBlockIterator struct { - ctrl *gomock.Controller - recorder *MockCrossBlockIteratorMockRecorder -} - -// MockCrossBlockIteratorMockRecorder is the mock recorder for MockCrossBlockIterator -type MockCrossBlockIteratorMockRecorder struct { - mock *MockCrossBlockIterator -} - -// NewMockCrossBlockIterator creates a new mock instance -func NewMockCrossBlockIterator(ctrl *gomock.Controller) *MockCrossBlockIterator { - mock := &MockCrossBlockIterator{ctrl: ctrl} - mock.recorder = &MockCrossBlockIteratorMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use -func (m *MockCrossBlockIterator) EXPECT() *MockCrossBlockIteratorMockRecorder { - return m.recorder -} - -// Close mocks base method -func (m *MockCrossBlockIterator) Close() { - m.ctrl.T.Helper() - m.ctrl.Call(m, "Close") -} - -// Close indicates an expected call of Close -func (mr *MockCrossBlockIteratorMockRecorder) Close() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockCrossBlockIterator)(nil).Close)) -} - -// Current mocks base method -func (m *MockCrossBlockIterator) Current() (ts.Datapoint, time0.Unit, ts.Annotation) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Current") - ret0, _ := ret[0].(ts.Datapoint) - ret1, _ := ret[1].(time0.Unit) - ret2, _ := ret[2].(ts.Annotation) - return ret0, ret1, ret2 -} - -// Current indicates an expected call of Current -func (mr *MockCrossBlockIteratorMockRecorder) Current() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Current", reflect.TypeOf((*MockCrossBlockIterator)(nil).Current)) -} - -// Err mocks base method -func (m *MockCrossBlockIterator) Err() error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Err") - ret0, _ := ret[0].(error) - return ret0 -} - -// Err indicates an expected call of Err -func (mr *MockCrossBlockIteratorMockRecorder) Err() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Err", reflect.TypeOf((*MockCrossBlockIterator)(nil).Err)) -} - -// Next mocks base method -func (m *MockCrossBlockIterator) Next() bool { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Next") - ret0, _ := ret[0].(bool) - return ret0 -} - -// Next indicates an expected call of Next -func (mr *MockCrossBlockIteratorMockRecorder) Next() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Next", reflect.TypeOf((*MockCrossBlockIterator)(nil).Next)) -} - -// Reset mocks base method -func (m *MockCrossBlockIterator) Reset(arg0 []BlockRecord) { - 
m.ctrl.T.Helper()
-	m.ctrl.Call(m, "Reset", arg0)
-}
-
-// Reset indicates an expected call of Reset
-func (mr *MockCrossBlockIteratorMockRecorder) Reset(arg0 interface{}) *gomock.Call {
-	mr.mock.ctrl.T.Helper()
-	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Reset", reflect.TypeOf((*MockCrossBlockIterator)(nil).Reset), arg0)
-}
-
 // MockStreamingWriter is a mock of StreamingWriter interface
 type MockStreamingWriter struct {
 	ctrl     *gomock.Controller
diff --git a/src/dbnode/persist/fs/msgpack/decoder.go b/src/dbnode/persist/fs/msgpack/decoder.go
index e2126a3873..532a3c66b3 100644
--- a/src/dbnode/persist/fs/msgpack/decoder.go
+++ b/src/dbnode/persist/fs/msgpack/decoder.go
@@ -38,6 +38,7 @@ var (
 	emptyIndexSummariesInfo   schema.IndexSummariesInfo
 	emptyIndexBloomFilterInfo schema.IndexBloomFilterInfo
 	emptyIndexEntry           schema.IndexEntry
+	emptyWideEntry            schema.WideEntry
 	emptyIndexSummary         schema.IndexSummary
 	emptyIndexSummaryToken    IndexSummaryToken
 	emptyLogInfo              schema.LogInfo
@@ -50,12 +51,12 @@ var (
 	errorIndexEntryChecksumMismatch = errors.New("decode index entry encountered checksum mismatch")
 )
 
-// IndexChecksumLookupStatus is the status for an index checksum lookup.
-type IndexChecksumLookupStatus byte
+// WideEntryLookupStatus is the status for a wide entry lookup.
+type WideEntryLookupStatus byte
 
 const (
 	// ErrorLookupStatus indicates an error state.
-	ErrorLookupStatus IndexChecksumLookupStatus = iota
+	ErrorLookupStatus WideEntryLookupStatus = iota
 	// MatchedLookupStatus indicates the current entry ID matches the requested ID.
 	MatchedLookupStatus
 	// MismatchLookupStatus indicates the current entry ID precedes the requested ID.
 	MismatchLookupStatus
@@ -151,24 +152,24 @@ func (dec *Decoder) DecodeIndexEntry(bytesPool pool.BytesPool) (schema.IndexEntr
 	return indexEntry, nil
 }
 
-// DecodeIndexEntryToIndexChecksum decodes an index entry into a minimal index entry.
-func (dec *Decoder) DecodeIndexEntryToIndexChecksum(
+// DecodeToWideEntry decodes an index entry into a wide entry.
+func (dec *Decoder) DecodeToWideEntry(
 	compareID []byte,
 	bytesPool pool.BytesPool,
-) (schema.IndexChecksum, IndexChecksumLookupStatus, error) {
+) (schema.WideEntry, WideEntryLookupStatus, error) {
 	if dec.err != nil {
-		return schema.IndexChecksum{}, NotFoundLookupStatus, dec.err
+		return emptyWideEntry, NotFoundLookupStatus, dec.err
 	}
 	dec.readerWithDigest.setDigestReaderEnabled(true)
 	_, numFieldsToSkip := dec.decodeRootObject(indexEntryVersion, indexEntryType)
-	indexWithMetaChecksum, status := dec.decodeIndexChecksum(compareID, bytesPool)
+	entry, status := dec.decodeWideEntry(compareID, bytesPool)
 	dec.readerWithDigest.setDigestReaderEnabled(false)
 	dec.skip(numFieldsToSkip)
 	if status != MatchedLookupStatus || dec.err != nil {
-		return schema.IndexChecksum{}, status, dec.err
+		return emptyWideEntry, status, dec.err
 	}
-	return indexWithMetaChecksum, status, nil
+	return entry, status, nil
 }
 
 // DecodeIndexSummary decodes index summary.
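// ---- Editor's note (illustrative sketch, not part of the patch) --------
// Entries in a fileset are written in ascending ID order, so the three
// lookup statuses returned by DecodeToWideEntry tell a scanning caller what
// to do: Matched means stop with a hit, Mismatch means the wanted ID may
// still follow, and NotFound means its slot has been passed. A hedged
// sketch of such a scan loop in this package's terms; entriesRemaining is a
// hypothetical helper, and a nil bytes pool is passed as in the tests.
func scanForWideEntry(
	dec *Decoder,
	wantID []byte,
	entriesRemaining func() bool,
) (schema.WideEntry, bool, error) {
	for entriesRemaining() {
		entry, status, err := dec.DecodeToWideEntry(wantID, nil)
		if err != nil {
			return emptyWideEntry, false, err
		}
		switch status {
		case MatchedLookupStatus:
			return entry, true, nil // exact hit
		case MismatchLookupStatus:
			continue // wantID sorts after this entry; keep scanning
		case NotFoundLookupStatus:
			return emptyWideEntry, false, nil // passed wantID's slot
		}
	}
	return emptyWideEntry, false, nil
}
// ---- End editor's note --------------------------------------------------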
@@ -482,13 +483,13 @@ func (dec *Decoder) decodeIndexEntry(bytesPool pool.BytesPool) schema.IndexEntry
 	return indexEntry
 }
 
-func (dec *Decoder) decodeIndexChecksum(
+func (dec *Decoder) decodeWideEntry(
 	compareID []byte,
 	bytesPool pool.BytesPool,
-) (schema.IndexChecksum, IndexChecksumLookupStatus) {
+) (schema.WideEntry, WideEntryLookupStatus) {
 	entry := dec.decodeIndexEntry(bytesPool)
 	if dec.err != nil {
-		return schema.IndexChecksum{}, ErrorLookupStatus
+		return emptyWideEntry, ErrorLookupStatus
 	}
 
 	if entry.EncodedTags == nil {
@@ -496,8 +497,8 @@ func (dec *Decoder) decodeIndexChecksum(
 			bytesPool.Put(entry.ID)
 		}
 
-		dec.err = fmt.Errorf("decode index checksum requires files V1+")
-		return schema.IndexChecksum{}, ErrorLookupStatus
+		dec.err = fmt.Errorf("decode wide index requires files V1+")
+		return emptyWideEntry, ErrorLookupStatus
 	}
 
 	compare := bytes.Compare(compareID, entry.ID)
@@ -505,7 +506,7 @@ func (dec *Decoder) decodeIndexChecksum(
 	if compare == 0 {
 		// NB: need to compute hash before freeing entry bytes.
 		checksum = dec.hasher.HashIndexEntry(entry)
-		return schema.IndexChecksum{
+		return schema.WideEntry{
 			IndexEntry:       entry,
 			MetadataChecksum: checksum,
 		}, MatchedLookupStatus
@@ -518,12 +519,12 @@ func (dec *Decoder) decodeIndexChecksum(
 
 	if compare > 0 {
 		// compareID can still exist after the current entry.ID
-		return schema.IndexChecksum{}, MismatchLookupStatus
+		return emptyWideEntry, MismatchLookupStatus
 	}
 
 	// compareID must have been before the current entry.ID, so this
 	// ID will not be matched.
-	return schema.IndexChecksum{}, NotFoundLookupStatus
+	return emptyWideEntry, NotFoundLookupStatus
 }
 
 func (dec *Decoder) decodeIndexSummary() (schema.IndexSummary, IndexSummaryToken) {
diff --git a/src/dbnode/persist/fs/msgpack/decoder_test.go b/src/dbnode/persist/fs/msgpack/decoder_test.go
index 126f92bea5..56a40fcb04 100644
--- a/src/dbnode/persist/fs/msgpack/decoder_test.go
+++ b/src/dbnode/persist/fs/msgpack/decoder_test.go
@@ -269,7 +269,7 @@ func TestDecodeBytesAllocNew(t *testing.T) {
 	require.Equal(t, []byte("testIndexEntry"), res.ID)
 }
 
-func TestDecodeIndexEntryInvalidChecksum(t *testing.T) {
+func TestDecodeIndexEntryInvalidWideEntry(t *testing.T) {
 	var (
 		enc = NewEncoder()
 		dec = NewDecoder(nil)
@@ -299,9 +299,9 @@ func TestDecodeIndexEntryIncompleteFile(t *testing.T) {
 	require.EqualError(t, err, "decode index entry encountered error: EOF")
 }
 
-var decodeIndexChecksumTests = []struct {
+var decodeWideEntryTests = []struct {
 	id         string
-	exStatus   IndexChecksumLookupStatus
+	exStatus   WideEntryLookupStatus
 	exChecksum int64
 }{
 	{id: "aaa", exStatus: NotFoundLookupStatus},
@@ -309,19 +309,19 @@
 	{id: "zzz", exStatus: MismatchLookupStatus},
 }
 
-func TestDecodeIndexEntryToIndexChecksum(t *testing.T) {
+func TestDecodeToWideEntry(t *testing.T) {
 	var (
 		enc = NewEncoder()
 		dec = NewDecoder(NewDecodingOptions().SetIndexEntryHasher(xhash.NewParsedIndexHasher(t)))
 	)
 
-	require.NoError(t, enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry))
+	require.NoError(t, enc.EncodeIndexEntry(testWideEntry.IndexEntry))
 	data := enc.Bytes()
 
-	for _, tt := range decodeIndexChecksumTests {
+	for _, tt := range decodeWideEntryTests {
 		t.Run(tt.id, func(t *testing.T) {
 			dec.Reset(NewByteDecoderStream(data))
-			res, status, err := dec.DecodeIndexEntryToIndexChecksum([]byte(tt.id), nil)
+			res, status, err := dec.DecodeToWideEntry([]byte(tt.id), nil)
 			require.NoError(t, err)
 			require.Equal(t, tt.exStatus, status)
 			if tt.exStatus == MatchedLookupStatus {
@@ -331,7 +331,7 @@ func
TestDecodeIndexEntryToIndexChecksum(t *testing.T) { } } -func TestDecodeIndexEntryToIndexChecksumPooled(t *testing.T) { +func TestDecodeToWideEntryPooled(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() @@ -340,19 +340,19 @@ func TestDecodeIndexEntryToIndexChecksumPooled(t *testing.T) { dec = NewDecoder(NewDecodingOptions().SetIndexEntryHasher(xhash.NewParsedIndexHasher(t))) ) - require.NoError(t, enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry)) + require.NoError(t, enc.EncodeIndexEntry(testWideEntry.IndexEntry)) data := enc.Bytes() - for _, tt := range decodeIndexChecksumTests { + for _, tt := range decodeWideEntryTests { t.Run(tt.id+"_pooled", func(t *testing.T) { dec.Reset(NewByteDecoderStream(data)) bytePool := pool.NewMockBytesPool(ctrl) - idLength := len(testIndexCheksumEntry.ID) + idLength := len(testWideEntry.ID) idBytes := make([]byte, idLength) bytePool.EXPECT().Get(idLength).Return(idBytes) - tagLength := len(testIndexCheksumEntry.EncodedTags) + tagLength := len(testWideEntry.EncodedTags) tagBytes := make([]byte, tagLength) bytePool.EXPECT().Get(tagLength).Return(tagBytes) @@ -361,7 +361,7 @@ func TestDecodeIndexEntryToIndexChecksumPooled(t *testing.T) { bytePool.EXPECT().Put(tagBytes) } - res, status, err := dec.DecodeIndexEntryToIndexChecksum([]byte(tt.id), bytePool) + res, status, err := dec.DecodeToWideEntry([]byte(tt.id), bytePool) require.NoError(t, err) require.Equal(t, tt.exStatus, status) if tt.exStatus == MatchedLookupStatus { diff --git a/src/dbnode/persist/fs/msgpack/roundtrip_test.go b/src/dbnode/persist/fs/msgpack/roundtrip_test.go index 46c98c20d0..251d2f476e 100644 --- a/src/dbnode/persist/fs/msgpack/roundtrip_test.go +++ b/src/dbnode/persist/fs/msgpack/roundtrip_test.go @@ -92,10 +92,10 @@ var ( EncodedTags: []byte("testLogMetadataTags"), } - // NB: 100 is the expected hash for checksums of `testIndexCheksumEntry` + // NB: 100 is the expected hash for checksums of `testWideEntry` testMetadataChecksum = int64(100) - testIndexCheksumEntry = schema.IndexChecksum{ + testWideEntry = schema.WideEntry{ IndexEntry: schema.IndexEntry{ Index: 234, ID: []byte("test100"), @@ -403,7 +403,7 @@ func TestIndexEntryRoundtrip(t *testing.T) { require.Equal(t, testIndexEntry, res) } -func TestIndexEntryIntoIndexChecksumRoundtripWithBytesPool(t *testing.T) { +func TestIndexEntryIntoWideEntryRoundtripWithBytesPool(t *testing.T) { var ( pool = pool.NewBytesPool(nil, nil) enc = NewEncoder() @@ -411,26 +411,26 @@ func TestIndexEntryIntoIndexChecksumRoundtripWithBytesPool(t *testing.T) { ) pool.Init() - require.NoError(t, enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry)) + require.NoError(t, enc.EncodeIndexEntry(testWideEntry.IndexEntry)) dec.Reset(NewByteDecoderStream(enc.Bytes())) - checksum, _, err := dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, pool) + checksum, _, err := dec.DecodeToWideEntry(testWideEntry.ID, pool) require.NoError(t, err) - require.Equal(t, testIndexCheksumEntry.IndexEntry, checksum.IndexEntry) - require.Equal(t, testIndexCheksumEntry.MetadataChecksum, checksum.MetadataChecksum) + require.Equal(t, testWideEntry.IndexEntry, checksum.IndexEntry) + require.Equal(t, testWideEntry.MetadataChecksum, checksum.MetadataChecksum) } -func TestIndexEntryIntoIndexChecksumRoundtripWithoutBytesPool(t *testing.T) { +func TestIndexEntryIntoWideEntryRoundtripWithoutBytesPool(t *testing.T) { var ( enc = NewEncoder() dec = NewDecoder(NewDecodingOptions().SetIndexEntryHasher(xhash.NewParsedIndexHasher(t))) ) - require.NoError(t, 
enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry)) + require.NoError(t, enc.EncodeIndexEntry(testWideEntry.IndexEntry)) dec.Reset(NewByteDecoderStream(enc.Bytes())) - checksum, _, err := dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, nil) + checksum, _, err := dec.DecodeToWideEntry(testWideEntry.ID, nil) require.NoError(t, err) - require.Equal(t, testIndexCheksumEntry.IndexEntry, checksum.IndexEntry) - require.Equal(t, testIndexCheksumEntry.MetadataChecksum, checksum.MetadataChecksum) + require.Equal(t, testWideEntry.IndexEntry, checksum.IndexEntry) + require.Equal(t, testWideEntry.MetadataChecksum, checksum.MetadataChecksum) } // Make sure the V3 decoding code can handle the V1 file format. @@ -450,32 +450,34 @@ func TestIndexEntryRoundTripBackwardsCompatibilityV1(t *testing.T) { // and then restore them at the end of the test - This is required // because the new decoder won't try and read the new fields from // the old file format. - currEncodedTags := testIndexCheksumEntry.EncodedTags + currEncodedTags := testWideEntry.EncodedTags - testIndexCheksumEntry.EncodedTags = nil + testWideEntry.EncodedTags = nil defer func() { - testIndexCheksumEntry.EncodedTags = currEncodedTags + testWideEntry.EncodedTags = currEncodedTags }() - enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry) + err := enc.EncodeIndexEntry(testWideEntry.IndexEntry) + require.NoError(t, err) + bytes := enc.Bytes() cloned := append(make([]byte, 0, len(bytes)), bytes...) dec.Reset(NewByteDecoderStream(bytes)) res, err := dec.DecodeIndexEntry(nil) require.NoError(t, err) - expected := testIndexCheksumEntry.IndexEntry + expected := testWideEntry.IndexEntry expected.IndexChecksum = 0 require.Equal(t, expected, res) - // Index Checksum decoding should fail since it requires tags for now. + // Wide Entry decoding should fail since it requires tags for now. dec.Reset(NewByteDecoderStream(cloned)) pool := pool.NewMockBytesPool(ctrl) - idLength := len(testIndexCheksumEntry.ID) + idLength := len(testWideEntry.ID) idBytes := make([]byte, idLength) pool.EXPECT().Get(idLength).Return(idBytes) pool.EXPECT().Put(idBytes) - _, status, err := dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, pool) + _, status, err := dec.DecodeToWideEntry(testWideEntry.ID, pool) require.Error(t, err) assert.Equal(t, ErrorLookupStatus, status) } @@ -495,13 +497,14 @@ func TestIndexEntryRoundTripForwardsCompatibilityV1(t *testing.T) { // Set the default values on the fields that did not exist in V1 // and then restore them at the end of the test - This is required // because the old decoder won't read the new fields. - currEncodedTags := testIndexCheksumEntry.EncodedTags + currEncodedTags := testWideEntry.EncodedTags - enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry) + err := enc.EncodeIndexEntry(testWideEntry.IndexEntry) + require.NoError(t, err) // Make sure to zero them before we compare, but after we have // encoded the data. 
- expected := testIndexCheksumEntry.IndexEntry + expected := testWideEntry.IndexEntry expected.EncodedTags = nil defer func() { expected.EncodedTags = currEncodedTags @@ -516,11 +519,11 @@ func TestIndexEntryRoundTripForwardsCompatibilityV1(t *testing.T) { dec.Reset(NewByteDecoderStream(enc.Bytes())) pool := pool.NewMockBytesPool(ctrl) - idLength := len(testIndexCheksumEntry.ID) + idLength := len(testWideEntry.ID) idBytes := make([]byte, idLength) pool.EXPECT().Get(idLength).Return(idBytes) pool.EXPECT().Put(idBytes) - _, status, err := dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, pool) + _, status, err := dec.DecodeToWideEntry(testWideEntry.ID, pool) require.Error(t, err) assert.Equal(t, ErrorLookupStatus, status) } @@ -538,23 +541,24 @@ func TestIndexEntryRoundTripBackwardsCompatibilityV2(t *testing.T) { // and decoder and is never set on the IndexEntry struct. Therefore, no need to zero out any field in the struct // to make a comparison. - enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry) + err := enc.EncodeIndexEntry(testWideEntry.IndexEntry) + require.NoError(t, err) dec.Reset(NewByteDecoderStream(enc.Bytes())) res, err := dec.DecodeIndexEntry(nil) require.NoError(t, err) - expected := testIndexCheksumEntry.IndexEntry + expected := testWideEntry.IndexEntry expected.IndexChecksum = 0 require.Equal(t, expected, res) dec.Reset(NewByteDecoderStream(enc.Bytes())) - chk, status, err := dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, nil) + chk, status, err := dec.DecodeToWideEntry(testWideEntry.ID, nil) require.NoError(t, err) assert.Equal(t, MatchedLookupStatus, status) - ex := testIndexCheksumEntry.IndexEntry + ex := testWideEntry.IndexEntry // This file version does not have an IndexChecksum field. ex.IndexChecksum = 0 require.Equal(t, ex, chk.IndexEntry) - require.Equal(t, testIndexCheksumEntry.MetadataChecksum, chk.MetadataChecksum) + require.Equal(t, testWideEntry.MetadataChecksum, chk.MetadataChecksum) } // Make sure the V2 decoder code can handle the V3 file format. @@ -569,23 +573,24 @@ func TestIndexEntryRoundTripForwardsCompatibilityV2(t *testing.T) { // and decoder and is never set on the IndexEntry struct. Therefore, no need to zero out any field in the struct // to make a comparison. - enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry) + err := enc.EncodeIndexEntry(testWideEntry.IndexEntry) + require.NoError(t, err) dec.Reset(NewByteDecoderStream(enc.Bytes())) res, err := dec.DecodeIndexEntry(nil) require.NoError(t, err) - expected := testIndexCheksumEntry.IndexEntry + expected := testWideEntry.IndexEntry expected.IndexChecksum = 0 require.Equal(t, expected, res) dec.Reset(NewByteDecoderStream(enc.Bytes())) - chk, status, err := dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, nil) + chk, status, err := dec.DecodeToWideEntry(testWideEntry.ID, nil) require.NoError(t, err) assert.Equal(t, MatchedLookupStatus, status) - ex := testIndexCheksumEntry.IndexEntry + ex := testWideEntry.IndexEntry // This file version does not have an IndexChecksum field. 
ex.IndexChecksum = 0 require.Equal(t, ex, chk.IndexEntry) - require.Equal(t, testIndexCheksumEntry.MetadataChecksum, chk.MetadataChecksum) + require.Equal(t, testWideEntry.MetadataChecksum, chk.MetadataChecksum) } func TestIndexSummaryRoundtrip(t *testing.T) { @@ -682,7 +687,7 @@ func TestMultiTypeRoundtripStress(t *testing.T) { require.NoError(t, enc.EncodeLogMetadata(testLogMetadata)) expected = append(expected, testLogMetadata) case 5: - require.NoError(t, enc.EncodeIndexEntry(testIndexCheksumEntry.IndexEntry)) + require.NoError(t, enc.EncodeIndexEntry(testWideEntry.IndexEntry)) expected = append(expected, testMetadataChecksum) } } @@ -702,12 +707,12 @@ func TestMultiTypeRoundtripStress(t *testing.T) { res, err = dec.DecodeLogMetadata() case 5: var ( - r schema.IndexChecksum - s IndexChecksumLookupStatus + e schema.WideEntry + s WideEntryLookupStatus ) - r, s, err = dec.DecodeIndexEntryToIndexChecksum(testIndexCheksumEntry.ID, nil) + e, s, err = dec.DecodeToWideEntry(testWideEntry.ID, nil) assert.Equal(t, s, MatchedLookupStatus) - res = r.MetadataChecksum + res = e.MetadataChecksum } require.NoError(t, err) output = append(output, res) diff --git a/src/dbnode/persist/fs/retriever.go b/src/dbnode/persist/fs/retriever.go index 0d96ceae25..7d9eca227e 100644 --- a/src/dbnode/persist/fs/retriever.go +++ b/src/dbnode/persist/fs/retriever.go @@ -33,19 +33,18 @@ package fs import ( "errors" + "fmt" "sort" "sync" "sync/atomic" "time" "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/storage/limits" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/dbnode/x/xio" - "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/ident" "github.com/m3db/m3/src/x/pool" @@ -68,8 +67,7 @@ const ( streamInvalidReq streamReqType = iota streamDataReq - streamIdxChecksumReq - streamReadMismatchReq + streamWideEntryReq ) type blockRetrieverStatus int @@ -201,9 +199,10 @@ func (r *blockRetriever) AssignShardSet(shardSet sharding.ShardSet) { func (r *blockRetriever) fetchLoop(seekerMgr DataFileSetSeekerManager) { var ( - seekerResources = NewReusableSeekerResources(r.fsOpts) - inFlight []*retrieveRequest - currBatchReqs []*retrieveRequest + seekerResources = NewReusableSeekerResources(r.fsOpts) + retrieverResources = newReuseableRetrieverResources() + inFlight []*retrieveRequest + currBatchReqs []*retrieveRequest ) for { // Free references to the inflight requests @@ -266,8 +265,8 @@ func (r *blockRetriever) fetchLoop(seekerMgr DataFileSetSeekerManager) { req.shard != currBatchShard { // Fetch any outstanding in the current batch if len(currBatchReqs) > 0 { - r.fetchBatch( - seekerMgr, currBatchShard, currBatchStart, currBatchReqs, seekerResources) + r.fetchBatch(seekerMgr, currBatchShard, currBatchStart, + currBatchReqs, seekerResources, retrieverResources) for i := range currBatchReqs { currBatchReqs[i] = nil } @@ -285,8 +284,8 @@ func (r *blockRetriever) fetchLoop(seekerMgr DataFileSetSeekerManager) { // Fetch any finally outstanding in the current batch if len(currBatchReqs) > 0 { - r.fetchBatch( - seekerMgr, currBatchShard, currBatchStart, currBatchReqs, seekerResources) + r.fetchBatch(seekerMgr, currBatchShard, currBatchStart, + currBatchReqs, seekerResources, retrieverResources) for i := range currBatchReqs { currBatchReqs[i] = nil } @@ -297,73 +296,69 @@ func (r *blockRetriever) fetchLoop(seekerMgr 
DataFileSetSeekerManager) {
 	r.fetchLoopsHaveShutdownCh <- struct{}{}
 }
 
-func (r *blockRetriever) processIndexChecksumRequest(
-	req *retrieveRequest,
-	seeker ConcurrentDataFileSetSeeker,
-	seekerResources ReusableSeekerResources,
-) {
-	checksum, err := seeker.SeekIndexEntryToIndexChecksum(req.id, seekerResources)
-	if err != nil {
-		req.onError(err)
-		return
-	}
-
-	req.onIndexChecksumCompleted(checksum)
-	req.onCallerOrRetrieverDone()
-}
-
-func (r *blockRetriever) processReadMismatchRequest(
-	req *retrieveRequest,
-	seeker ConcurrentDataFileSetSeeker,
-	seekerResources ReusableSeekerResources,
-) {
-	checksum, err := seeker.SeekIndexEntryToIndexChecksum(req.id, seekerResources)
-	if err != nil {
-		req.onError(err)
-		return
-	}
-
-	mismatch, err := seeker.SeekReadMismatchesByIndexChecksum(
-		checksum, req.mismatchChecker, seekerResources)
-
-	if err != nil && err != errSeekIDNotFound {
-		req.onError(err)
-		return
-	}
-
-	if err == errSeekIDNotFound {
-		req.onIndexMismatchCompleted(wide.ReadMismatch{})
-		return
-	}
-
-	req.onIndexMismatchCompleted(mismatch)
-	req.onCallerOrRetrieverDone()
-}
-
 // filterAndCompleteWideReqs completes all wide operation retrieve requests,
 // returning a list of requests that need to be processed by other means.
 func (r *blockRetriever) filterAndCompleteWideReqs(
 	reqs []*retrieveRequest,
 	seeker ConcurrentDataFileSetSeeker,
 	seekerResources ReusableSeekerResources,
+	retrieverResources *reuseableRetrieverResources,
 ) []*retrieveRequest {
-	filteredStreamRequests := reqs[:0]
+	retrieverResources.resetDataReqs()
+	retrieverResources.resetWideEntryReqs()
 	for _, req := range reqs {
 		switch req.streamReqType {
 		case streamDataReq:
 			// NB: filter out stream requests; these are handled outside of
 			// wide logic functions.
-			filteredStreamRequests = append(filteredStreamRequests, req)
-		case streamIdxChecksumReq:
-			r.processIndexChecksumRequest(req, seeker, seekerResources)
-		case streamReadMismatchReq:
-			r.processReadMismatchRequest(req, seeker, seekerResources)
+			retrieverResources.dataReqs = append(retrieverResources.dataReqs, req)
+
+		case streamWideEntryReq:
+			entry, err := seeker.SeekWideEntry(req.id, seekerResources)
+			if err != nil {
+				if errors.Is(err, errSeekIDNotFound) {
+					// Missing: return an empty result as a successful lookup.
+					req.wideEntry = xio.WideEntry{}
+					req.success = true
+				} else {
+					req.err = err
+				}
+
+				continue
+			}
+
+			// Enqueue to be fetched in the batch in offset-ascending order.
+			// NB: set the shard on the entry before copying it onto the
+			// request, so the assignment is not lost on the struct copy.
+			entry.Shard = req.shard
+			req.wideEntry = entry
+			retrieverResources.appendWideEntryReq(req)
+
 		default:
-			req.onError(errUnsetRequestType)
+			req.err = errUnsetRequestType
 		}
 	}
 
-	return filteredStreamRequests
+	// Fulfill the wide entry data fetches in offset-ascending order.
+	sortByOffsetAsc := retrieveRequestByWideEntryOffsetAsc(retrieverResources.wideEntryReqs)
+	sort.Sort(sortByOffsetAsc)
+	for _, req := range retrieverResources.wideEntryReqs {
+		entry := IndexEntry{
+			Size:         uint32(req.wideEntry.Size),
+			DataChecksum: uint32(req.wideEntry.DataChecksum),
+			Offset:       req.wideEntry.Offset,
+		}
+		data, err := seeker.SeekByIndexEntry(entry, seekerResources)
+		if err != nil {
+			req.err = err
+			continue
+		}
+
+		// Success: inc ref so that finalize can later decref and finalize.
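+		// (This extra reference is released when the request itself is
+		// finalized, i.e. once both the caller and the retriever are done
+		// with it and the wide entry is finalized.)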
+		req.wideEntry.Data = data
+		req.wideEntry.Data.IncRef()
+		req.success = true
+	}
+
+	return retrieverResources.dataReqs
 }
 
 func (r *blockRetriever) fetchBatch(
@@ -372,18 +367,43 @@ func (r *blockRetriever) fetchBatch(
 	blockStart time.Time,
 	allReqs []*retrieveRequest,
 	seekerResources ReusableSeekerResources,
+	retrieverResources *reuseableRetrieverResources,
 ) {
-	// Resolve the seeker from the seeker mgr
-	seeker, err := seekerMgr.Borrow(shard, blockStart)
-	if err != nil {
+	var (
+		seeker     ConcurrentDataFileSetSeeker
+		callbackWg sync.WaitGroup
+	)
+
+	defer func() {
+		filteredReqs := allReqs[:0]
+		// Make sure requests are always fulfilled: if a code bug leaves
+		// req.success unset then errSeekNotCompleted is returned, rather
+		// than dangling goroutines stacking up.
 		for _, req := range allReqs {
-			req.onError(err)
+			if !req.waitingForCallback {
+				req.onDone()
+				continue
+			}
+
+			filteredReqs = append(filteredReqs, req)
 		}
-		return
-	}
 
-	defer func() {
-		err = seekerMgr.Return(shard, blockStart, seeker)
+		callbackWg.Wait()
+		for _, req := range filteredReqs {
+			req.onDone()
+		}
+
+		// Reset resources to free any pointers in the slices still pointing
+		// to requests that are now completed and returned to pools.
+		retrieverResources.resetAll()
+
+		if seeker == nil {
+			// No borrowed seeker to return.
+			return
+		}
+
+		// Return borrowed seeker.
+		err := seekerMgr.Return(shard, blockStart, seeker)
 		if err != nil {
 			r.logger.Error("err returning seeker for shard",
 				zap.Uint32("shard", shard),
@@ -393,86 +413,106 @@ func (r *blockRetriever) fetchBatch(
 		}
 	}()
 
+	var err error
+	seeker, err = seekerMgr.Borrow(shard, blockStart)
+	if err != nil {
+		for _, req := range allReqs {
+			req.err = err
+		}
+		return
+	}
+
 	// NB: filterAndCompleteWideReqs will complete any wide requests, returning
 	// a filtered list of requests that should be processed below. These wide
 	// requests must not take query limits into account.
-	reqs := r.filterAndCompleteWideReqs(allReqs, seeker, seekerResources)
+	reqs := r.filterAndCompleteWideReqs(allReqs, seeker, seekerResources,
+		retrieverResources)
 
 	var limitErr error
 	if err := r.queryLimits.AnyExceeded(); err != nil {
 		for _, req := range reqs {
-			req.onError(err)
+			req.err = err
 		}
 		return
 	}
 
 	for _, req := range reqs {
 		if limitErr != nil {
-			req.onError(limitErr)
+			req.err = limitErr
 			continue
 		}
 
 		entry, err := seeker.SeekIndexEntry(req.id, seekerResources)
-		if err != nil && err != errSeekIDNotFound {
-			req.onError(err)
+		if err != nil && !errors.Is(err, errSeekIDNotFound) {
+			req.err = err
 			continue
 		}
 
 		if err := r.bytesReadLimit.Inc(int(entry.Size)); err != nil {
-			req.onError(err)
+			req.err = err
 			limitErr = err
 			continue
 		}
 
-		if err == errSeekIDNotFound {
+		if errors.Is(err, errSeekIDNotFound) {
 			req.notFound = true
 		}
 
 		req.indexEntry = entry
 	}
 
-	sort.Sort(retrieveRequestByOffsetAsc(reqs))
+	sort.Sort(retrieveRequestByIndexEntryOffsetAsc(reqs))
 	tagDecoderPool := r.fsOpts.TagDecoderPool()
 
 	blockCachingEnabled := r.opts.CacheBlocksOnRetrieve() && r.nsCacheBlocksOnRetrieve
 
 	// Seek and execute all requests
 	for _, req := range reqs {
-		var (
-			data checked.Bytes
-			err  error
-		)
+		// Should always be a data request by this point.
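+		// (Wide requests were already completed in filterAndCompleteWideReqs
+		// above, so anything else reaching this loop indicates a bug.)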
+		if req.streamReqType != streamDataReq {
+			req.err = fmt.Errorf("wrong stream req type: expect=%d, actual=%d",
+				streamDataReq, req.streamReqType)
+			continue
+		}
 
-		// Only try to seek the ID if it exists and there haven't been any errors so
-		// far, otherwise we'll get a checksum mismatch error because the default
-		// offset value for indexEntry is zero.
-		if req.foundAndHasNoError() {
-			data, err = seeker.SeekByIndexEntry(req.indexEntry, seekerResources)
-			if err != nil && err != errSeekIDNotFound {
-				req.onError(err)
-				continue
-			}
+		if req.err != nil {
+			// Skip requests that already errored; they will get the
+			// appropriate callback.
+			continue
+		}
+
+		if req.notFound {
+			// ID not found: complete the request as successful with an empty
+			// result. Do not attempt the seek, since the default zero offset
+			// in indexEntry would read the wrong data and fail with a
+			// checksum mismatch error.
+			req.success = true
+			req.onCallerOrRetrieverDone()
+			continue
+		}
+
+		data, err := seeker.SeekByIndexEntry(req.indexEntry, seekerResources)
+		if err != nil {
+			// A not found error here is still an error: the entry was just
+			// located in the index file, so the data is expected to exist.
+			req.err = err
+			continue
 		}
 
 		var (
 			seg, onRetrieveSeg ts.Segment
 			checksum           = req.indexEntry.DataChecksum
 		)
-		if data != nil {
-			seg = ts.NewSegment(data, nil, checksum, ts.FinalizeHead)
-		}
+		seg = ts.NewSegment(data, nil, checksum, ts.FinalizeHead)
 
 		// We don't need to call onRetrieve.OnRetrieveBlock if the ID was not found.
-		callOnRetrieve := blockCachingEnabled && req.onRetrieve != nil && req.foundAndHasNoError()
+		callOnRetrieve := blockCachingEnabled && req.onRetrieve != nil
 		if callOnRetrieve {
 			// NB(r): Need to also trigger callback with a copy of the data.
 			// This is used by the database to cache the in memory data for
 			// consequent fetches.
-			if data != nil {
-				dataCopy := r.bytesPool.Get(data.Len())
-				onRetrieveSeg = ts.NewSegment(dataCopy, nil, checksum, ts.FinalizeHead)
-				dataCopy.AppendAll(data.Bytes())
-			}
+			dataCopy := r.bytesPool.Get(data.Len())
+			onRetrieveSeg = ts.NewSegment(dataCopy, nil, checksum, ts.FinalizeHead)
+			dataCopy.AppendAll(data.Bytes())
+
 			if tags := req.indexEntry.EncodedTags; tags != nil && tags.Len() > 0 {
 				decoder := tagDecoderPool.Get()
 				// DecRef because we're transferring ownership from the index entry to
@@ -492,17 +532,24 @@ func (r *blockRetriever) fetchBatch(
 
 		// Complete request.
 		req.onRetrieved(seg, req.nsCtx)
+		req.success = true
 
 		if !callOnRetrieve {
-			// No need to call the onRetrieve callback.
+			// No need to call the onRetrieve callback, but onCallerOrRetrieverDone
+			// must still be called here: data requests are not finalized by
+			// req.onDone because they sometimes need deferred finalization
+			// (when callOnRetrieve is true).
			req.onCallerOrRetrieverDone()
 			continue
 		}
 
+		callbackWg.Add(1)
+		req.waitingForCallback = true
 		go func(r *retrieveRequest) {
 			// Call the onRetrieve callback and finalize.
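+			// (callbackWg lets fetchBatch's deferred cleanup wait for all
+			// in-flight OnRetrieveBlock callbacks before completing their
+			// requests.)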
r.onRetrieve.OnRetrieveBlock(r.id, r.tags, r.start, onRetrieveSeg, r.nsCtx) r.onCallerOrRetrieverDone() + callbackWg.Done() }(req) } } @@ -586,11 +633,14 @@ func (r *blockRetriever) Stream( found, err := r.streamRequest(ctx, req, shard, id, startTime, nsCtx) if err != nil { + req.resultWg.Done() return xio.EmptyBlockReader, err } if !found { req.onRetrieved(ts.Segment{}, namespace.Context{}) + req.success = true + req.onDone() } // The request may not have completed yet, but it has an internal @@ -601,52 +651,26 @@ func (r *blockRetriever) Stream( return req.toBlock(), nil } -func (r *blockRetriever) StreamIndexChecksum( - ctx context.Context, - shard uint32, - id ident.ID, - startTime time.Time, - nsCtx namespace.Context, -) (block.StreamedChecksum, error) { - req := r.reqPool.Get() - req.streamReqType = streamIdxChecksumReq - - found, err := r.streamRequest(ctx, req, shard, id, startTime, nsCtx) - if err != nil { - return block.EmptyStreamedChecksum, err - } - - if !found { - req.onIndexChecksumCompleted(xio.IndexChecksum{}) - } - - // The request may not have completed yet, but it has an internal - // waitgroup which the caller will have to wait for before retrieving - // the data. This means that even though we're returning nil for error - // here, the caller may still encounter an error when they attempt to - // read the data. - return req, nil -} - -func (r *blockRetriever) StreamReadMismatches( +func (r *blockRetriever) StreamWideEntry( ctx context.Context, shard uint32, - mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, startTime time.Time, nsCtx namespace.Context, -) (wide.StreamedMismatch, error) { +) (block.StreamedWideEntry, error) { req := r.reqPool.Get() - req.mismatchChecker = mismatchChecker - req.streamReqType = streamReadMismatchReq + req.streamReqType = streamWideEntryReq found, err := r.streamRequest(ctx, req, shard, id, startTime, nsCtx) if err != nil { - return wide.EmptyStreamedMismatch, err + req.resultWg.Done() + return block.EmptyStreamedWideEntry, err } if !found { - req.onIndexMismatchCompleted(wide.ReadMismatch{}) + req.wideEntry = xio.WideEntry{} + req.success = true + req.onDone() } // The request may not have completed yet, but it has an internal @@ -740,7 +764,9 @@ func (reqs *shardRetrieveRequests) resetQueued() { // Don't forget to update the resetForReuse method when adding a new field type retrieveRequest struct { - resultWg sync.WaitGroup + finalized bool + waitingForCallback bool + resultWg sync.WaitGroup pool *reqPool @@ -753,12 +779,9 @@ type retrieveRequest struct { streamReqType streamReqType indexEntry IndexEntry - indexChecksum xio.IndexChecksum - mismatchBatch wide.ReadMismatch + wideEntry xio.WideEntry reader xio.SegmentReader - mismatchChecker wide.EntryChecksumMismatchChecker - err error // Finalize requires two calls to finalize (once both the user of the @@ -768,38 +791,16 @@ type retrieveRequest struct { shard uint32 notFound bool + success bool } -func (req *retrieveRequest) onIndexChecksumCompleted(indexChecksum xio.IndexChecksum) { - if req.err == nil { - req.indexChecksum = indexChecksum - // If there was an error, we've already called done. 
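Stream and StreamWideEntry both hand back the request itself as a future: the caller blocks on an internal WaitGroup until the fetch loop publishes a result. A minimal sketch of that contract, with a hypothetical type standing in for retrieveRequest (not this file's actual implementation):

import "sync"

// retrieveFuture is completed exactly once by the fetch loop and then
// read by the caller; wait blocks until the result is published.
type retrieveFuture struct {
	wg     sync.WaitGroup
	result []byte
	err    error
}

func newRetrieveFuture() *retrieveFuture {
	f := &retrieveFuture{}
	f.wg.Add(1) // matched by the single Done in complete
	return f
}

// complete publishes the outcome; it must be called exactly once.
func (f *retrieveFuture) complete(result []byte, err error) {
	f.result, f.err = result, err
	f.wg.Done()
}

// wait blocks until complete has run, then returns the outcome.
func (f *retrieveFuture) wait() ([]byte, error) {
	f.wg.Wait()
	return f.result, f.err
}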
-		req.resultWg.Done()
-	}
-}
-
-func (req *retrieveRequest) RetrieveIndexChecksum() (xio.IndexChecksum, error) {
+func (req *retrieveRequest) RetrieveWideEntry() (xio.WideEntry, error) {
 	req.resultWg.Wait()
 	if req.err != nil {
-		return xio.IndexChecksum{}, req.err
-	}
-	return req.indexChecksum, nil
-}
-
-func (req *retrieveRequest) onIndexMismatchCompleted(batch wide.ReadMismatch) {
-	if req.err == nil {
-		req.mismatchBatch = batch
-		// If there was an error, we've already called done.
-		req.resultWg.Done()
+		return xio.WideEntry{}, req.err
 	}
-}
 
-func (req *retrieveRequest) RetrieveMismatch() (wide.ReadMismatch, error) {
-	req.resultWg.Wait()
-	if req.err != nil {
-		return wide.ReadMismatch{}, req.err
-	}
-	return req.mismatchBatch, nil
+	return req.wideEntry, nil
 }
 
 func (req *retrieveRequest) toBlock() xio.BlockReader {
@@ -810,49 +811,84 @@ func (req *retrieveRequest) toBlock() xio.BlockReader {
 	}
 }
 
-func (req *retrieveRequest) onError(err error) {
-	if req.err == nil {
-		req.err = err
-		req.resultWg.Done()
-	}
-}
-
 func (req *retrieveRequest) onRetrieved(segment ts.Segment, nsCtx namespace.Context) {
-	req.Reset(segment)
 	req.nsCtx = nsCtx
+	req.Reset(segment)
 }
 
-func (req *retrieveRequest) onCallerOrRetrieverDone() {
-	if atomic.AddUint32(&req.finalizes, 1) != 2 {
-		return
-	}
-	// NB: streamIdxChecksumReq ids are used to sort the resultant list,
-	// so they should not be finalized here.
-	if req.streamReqType == streamDataReq {
-		req.id.Finalize()
+func (req *retrieveRequest) onDone() {
+	var (
+		err           = req.err
+		success       = req.success
+		streamReqType = req.streamReqType
+	)
+
+	if err == nil && !success {
+		// Require explicit success, otherwise this request was never
+		// completed. This helps catch code bugs where an element was not
+		// explicitly handled as completed during a fetch batch call and
+		// instead returned without an actual result properly set.
+		req.err = errSeekNotCompleted
 	}
-	req.id = nil
-	if req.tags != nil {
-		req.tags.Close()
-		req.tags = ident.EmptyTagIterator
+
+	req.resultWg.Done()
+
+	switch streamReqType {
+	case streamDataReq:
+		// Do not call onCallerOrRetrieverDone for successful data requests:
+		// either the OnRetrieveBlock callback code path calls
+		// req.onCallerOrRetrieverDone() when it finishes, or the happy path
+		// in fetchBatch has already called it pre-emptively. If the request
+		// failed, neither of those paths will run, so call it here.
+		if !success {
+			req.onCallerOrRetrieverDone()
+		}
+	default:
+		// All other requests increment the finalize count by one here; the
+		// caller's req.Finalize() provides the second increment that brings
+		// the count to two and actually returns the request to the pool.
+		req.onCallerOrRetrieverDone()
	}
-	req.reader.Finalize()
-	req.reader = nil
-	req.pool.Put(req)
 }
 
 func (req *retrieveRequest) Reset(segment ts.Segment) {
 	req.reader.Reset(segment)
-	if req.err == nil {
-		// If there was an error, we've already called done.
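The two-call finalize protocol that onDone and Finalize both funnel into is an atomic count-to-two: the caller and the retriever each check in once, and whichever party finishes second releases the request back to its pool. Roughly, as a standalone sketch (hypothetical type and release hook, not the patch's exact code):

import "sync/atomic"

type twoPartyReq struct {
	finalizes uint32
	release   func() // e.g. returns the object to a sync.Pool
}

// done is called once by the caller and once by the retriever; the
// second call performs the actual release.
func (r *twoPartyReq) done() {
	if atomic.AddUint32(&r.finalizes, 1) == 2 {
		r.release()
	}
}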
- req.resultWg.Done() - } } func (req *retrieveRequest) ResetWindowed(segment ts.Segment, start time.Time, blockSize time.Duration) { - req.Reset(segment) req.start = start req.blockSize = blockSize + req.Reset(segment) +} + +func (req *retrieveRequest) onCallerOrRetrieverDone() { + if atomic.AddUint32(&req.finalizes, 1) != 2 { + return + } + + switch req.streamReqType { + case streamWideEntryReq: + // All pooled elements are set on the wideEntry. + req.wideEntry.Finalize() + default: + if req.id != nil { + req.id.Finalize() + req.id = nil + } + if req.tags != nil { + req.tags.Close() + req.tags = ident.EmptyTagIterator + } + if req.reader != nil { + req.reader.Finalize() + req.reader = nil + } + } + + req.pool.Put(req) } func (req *retrieveRequest) SegmentReader() (xio.SegmentReader, error) { @@ -898,11 +934,18 @@ func (req *retrieveRequest) Segment() (ts.Segment, error) { func (req *retrieveRequest) Finalize() { // May not actually finalize the request, depending on if // retriever is done too + if req.finalized { + return + } + + req.resultWg.Wait() + req.finalized = true req.onCallerOrRetrieverDone() } func (req *retrieveRequest) resetForReuse() { req.resultWg = sync.WaitGroup{} + req.finalized = false req.finalizes = 0 req.shard = 0 req.id = nil @@ -912,16 +955,11 @@ func (req *retrieveRequest) resetForReuse() { req.onRetrieve = nil req.streamReqType = streamInvalidReq req.indexEntry = IndexEntry{} - req.indexChecksum = xio.IndexChecksum{} - req.mismatchBatch = wide.ReadMismatch{} - req.mismatchChecker = nil + req.wideEntry = xio.WideEntry{} req.reader = nil req.err = nil req.notFound = false -} - -func (req *retrieveRequest) foundAndHasNoError() bool { - return !req.notFound && req.err == nil + req.success = false } type retrieveRequestByStartAscShardAsc []*retrieveRequest @@ -935,18 +973,29 @@ func (r retrieveRequestByStartAscShardAsc) Less(i, j int) bool { return r[i].shard < r[j].shard } -type retrieveRequestByOffsetAsc []*retrieveRequest +type retrieveRequestByIndexEntryOffsetAsc []*retrieveRequest -func (r retrieveRequestByOffsetAsc) Len() int { return len(r) } -func (r retrieveRequestByOffsetAsc) Swap(i, j int) { r[i], r[j] = r[j], r[i] } -func (r retrieveRequestByOffsetAsc) Less(i, j int) bool { +func (r retrieveRequestByIndexEntryOffsetAsc) Len() int { return len(r) } +func (r retrieveRequestByIndexEntryOffsetAsc) Swap(i, j int) { r[i], r[j] = r[j], r[i] } +func (r retrieveRequestByIndexEntryOffsetAsc) Less(i, j int) bool { return r[i].indexEntry.Offset < r[j].indexEntry.Offset } +type retrieveRequestByWideEntryOffsetAsc []*retrieveRequest + +func (r retrieveRequestByWideEntryOffsetAsc) Len() int { return len(r) } +func (r retrieveRequestByWideEntryOffsetAsc) Swap(i, j int) { r[i], r[j] = r[j], r[i] } +func (r retrieveRequestByWideEntryOffsetAsc) Less(i, j int) bool { + return r[i].wideEntry.Offset < r[j].wideEntry.Offset +} + // RetrieveRequestPool is the retrieve request pool. type RetrieveRequestPool interface { + // Init initializes the request pool. Init() + // Get gets a retrieve request. Get() *retrieveRequest + // Put returns a retrieve request to the pool. 
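+	// (Put resets the request for reuse via resetForReuse before handing
+	// it back to the underlying pool.)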
Put(req *retrieveRequest) } @@ -986,3 +1035,37 @@ func (p *reqPool) Put(req *retrieveRequest) { req.resetForReuse() p.pool.Put(req) } + +type reuseableRetrieverResources struct { + dataReqs []*retrieveRequest + wideEntryReqs []*retrieveRequest +} + +func newReuseableRetrieverResources() *reuseableRetrieverResources { + return &reuseableRetrieverResources{} +} + +func (r *reuseableRetrieverResources) resetAll() { + r.resetDataReqs() + r.resetWideEntryReqs() +} + +func (r *reuseableRetrieverResources) resetDataReqs() { + for i := range r.dataReqs { + r.dataReqs[i] = nil + } + r.dataReqs = r.dataReqs[:0] +} + +func (r *reuseableRetrieverResources) resetWideEntryReqs() { + for i := range r.wideEntryReqs { + r.wideEntryReqs[i] = nil + } + r.wideEntryReqs = r.wideEntryReqs[:0] +} + +func (r *reuseableRetrieverResources) appendWideEntryReq( + req *retrieveRequest, +) { + r.wideEntryReqs = append(r.wideEntryReqs, req) +} diff --git a/src/dbnode/persist/fs/retriever_test.go b/src/dbnode/persist/fs/retriever_test.go index 824e012c96..07754dc90e 100644 --- a/src/dbnode/persist/fs/retriever_test.go +++ b/src/dbnode/persist/fs/retriever_test.go @@ -1,5 +1,3 @@ -// +build big -// // Copyright (c) 2016 Uber Technologies, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy @@ -772,8 +770,6 @@ func testBlockRetrieverOnRetrieve(t *testing.T, globalFlag bool, nsFlag bool) { } else { require.False(t, onRetrieveCalled) } - - require.NoError(t, err) } // TestBlockRetrieverHandlesErrors verifies the behavior of the Stream() method diff --git a/src/dbnode/persist/fs/seek.go b/src/dbnode/persist/fs/seek.go index 5fc175fcf1..956884134b 100644 --- a/src/dbnode/persist/fs/seek.go +++ b/src/dbnode/persist/fs/seek.go @@ -31,7 +31,6 @@ import ( "github.com/m3db/m3/src/dbnode/digest" xmsgpack "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/persist/schema" "github.com/m3db/m3/src/dbnode/x/xio" "github.com/m3db/m3/src/x/checked" @@ -52,6 +51,9 @@ var ( // errSeekChecksumMismatch returned when data checksum does not match the expected checksum errSeekChecksumMismatch = errors.New("checksum does not match expected checksum") + // errSeekNotCompleted returned when no error but seek did not complete. + errSeekNotCompleted = errors.New("seek not completed") + // errClonesShouldNotBeOpened returned when Open() is called on a clone errClonesShouldNotBeOpened = errors.New("clone should not be opened") ) @@ -374,87 +376,6 @@ func (s *seeker) SeekByIndexEntry( return buffer, nil } -func (s *seeker) SeekReadMismatchesByIndexChecksum( - checksum xio.IndexChecksum, - mismatchChecker wide.EntryChecksumMismatchChecker, - resources ReusableSeekerResources, -) (wide.ReadMismatch, error) { - completed := false - defer func() { - // NB: if this fails to complete, finalize the checksum. - if !completed { - checksum.Finalize() - } - }() - - mismatchChecker.Lock() - // NB: first, apply the reader. - allMismatches, err := mismatchChecker.ComputeMismatchesForEntry(checksum) - if err != nil { - // NB: free checksum resources - return wide.ReadMismatch{}, err - } - - // NB: only filter out reader side mismatches. TODO: remove index checksum - // mismatches, since they are not necessary in the updated model. - mismatches := allMismatches[:0] - for _, m := range allMismatches { - if m.IsReaderMismatch() { - mismatches = append(mismatches, m) - } - } - - if len(mismatches) == 0 { - // This entry matches; no need to retrieve data. 
- return wide.ReadMismatch{}, nil - } - - if len(mismatches) > 1 { - return wide.ReadMismatch{}, fmt.Errorf("multiple reader mismatches") - } - - mismatchChecker.Unlock() - resources.offsetFileReader.reset(s.dataFd, checksum.Offset) - - // Obtain an appropriately sized buffer. - var buffer checked.Bytes - if s.opts.bytesPool != nil { - buffer = s.opts.bytesPool.Get(int(checksum.Size)) - buffer.IncRef() - defer buffer.DecRef() - buffer.Resize(int(checksum.Size)) - } else { - buffer = checked.NewBytes(make([]byte, checksum.Size), nil) - buffer.IncRef() - defer buffer.DecRef() - } - - // Copy the actual data into the underlying buffer. - underlyingBuf := buffer.Bytes() - n, err := io.ReadFull(resources.offsetFileReader, underlyingBuf) - if err != nil { - return wide.ReadMismatch{}, err - } - if n != int(checksum.Size) { - // This check is redundant because io.ReadFull will return an error if - // its not able to read the specified number of bytes, but we keep it - // in for posterity. - return wide.ReadMismatch{}, fmt.Errorf("tried to read: %d bytes but read: %d", checksum.Size, n) - } - - // NB(r): _must_ check the checksum against known checksum as the data - // file might not have been verified if we haven't read through the file yet. - if checksum.DataChecksum != int64(digest.Checksum(underlyingBuf)) { - return wide.ReadMismatch{}, errSeekChecksumMismatch - } - - completed = true - return wide.ReadMismatch{ - IndexChecksum: checksum, - Data: buffer, - }, nil -} - // SeekIndexEntry performs the following steps: // // 1. Go to the indexLookup and it will give us an offset that is a good starting @@ -544,25 +465,25 @@ func (s *seeker) SeekIndexEntry( } } -// SeekIndexEntryToIndexChecksum performs the following steps: +// SeekWideEntry performs the following steps: // // 1. Go to the indexLookup and it will give us an offset that is a good starting // point for scanning the index file. // 2. Reset an offsetFileReader with the index fd and an offset (so that calls to Read() will // begin at the offset provided by the offset lookup). // 3. Reset a decoder with fileDecoderStream (offsetFileReader wrapped in a bufio.Reader). -// 4. Call DecodeIndexEntry in a tight loop (which will advance our position in the +// 4. Call DecodeToWideEntry in a tight loop (which will advance our position in the // offsetFileReader internally) until we've either found the entry we're looking for or gone so // far we know it does not exist. -func (s *seeker) SeekIndexEntryToIndexChecksum( +func (s *seeker) SeekWideEntry( id ident.ID, resources ReusableSeekerResources, -) (xio.IndexChecksum, error) { +) (xio.WideEntry, error) { offset, err := s.indexLookup.getNearestIndexFileOffset(id, resources) // Should never happen, either something is really wrong with the code or // the file on disk was corrupted. if err != nil { - return xio.IndexChecksum{}, err + return xio.WideEntry{}, err } resources.offsetFileReader.reset(s.indexFd, offset) @@ -571,39 +492,39 @@ func (s *seeker) SeekIndexEntryToIndexChecksum( idBytes := id.Bytes() for { - checksum, status, err := resources.xmsgpackDecoder. - DecodeIndexEntryToIndexChecksum(idBytes, resources.decodeIndexEntryBytesPool) + entry, status, err := resources.xmsgpackDecoder. + DecodeToWideEntry(idBytes, resources.decodeIndexEntryBytesPool) if err != nil { // No longer being used so we can return to the pool. 
- resources.decodeIndexEntryBytesPool.Put(checksum.ID) - resources.decodeIndexEntryBytesPool.Put(checksum.EncodedTags) + resources.decodeIndexEntryBytesPool.Put(entry.ID) + resources.decodeIndexEntryBytesPool.Put(entry.EncodedTags) if err == io.EOF { // Reached the end of the file without finding the ID. - return xio.IndexChecksum{}, errSeekIDNotFound + return xio.WideEntry{}, errSeekIDNotFound } // Should never happen, either something is really wrong with the code or // the file on disk was corrupted. - return xio.IndexChecksum{}, instrument.InvariantErrorf(err.Error()) + return xio.WideEntry{}, instrument.InvariantErrorf(err.Error()) } if status != xmsgpack.MatchedLookupStatus { // No longer being used so we can return to the pool. - resources.decodeIndexEntryBytesPool.Put(checksum.ID) - resources.decodeIndexEntryBytesPool.Put(checksum.EncodedTags) + resources.decodeIndexEntryBytesPool.Put(entry.ID) + resources.decodeIndexEntryBytesPool.Put(entry.EncodedTags) if status == xmsgpack.NotFoundLookupStatus { - // a `NotFound` status for the index checksum decode indicates that the + // a `NotFound` status for the wide entry decode indicates that the // current seek has passed the point in the file where this ID could have // appeared; short-circuit here as the ID does not exist in the file. - return xio.IndexChecksum{}, errSeekIDNotFound + return xio.WideEntry{}, errSeekIDNotFound } else if status == xmsgpack.MismatchLookupStatus { - // a `Mismatch` status for the index checksum decode indicates that the + // a `Mismatch` status for the wide entry decode indicates that the // current seek does not match the ID, but that it may still appear in // the file. continue } else if status == xmsgpack.ErrorLookupStatus { - return xio.IndexChecksum{}, errors.New("unknown index lookup error") + return xio.WideEntry{}, errors.New("unknown index lookup error") } } @@ -611,23 +532,23 @@ func (s *seeker) SeekIndexEntryToIndexChecksum( // so they can be passed along. We use the "real" bytes pool here // because we're passing ownership of the bytes to the entry / caller. var checkedEncodedTags checked.Bytes - if tags := checksum.EncodedTags; len(tags) > 0 { + if tags := entry.EncodedTags; len(tags) > 0 { checkedEncodedTags = s.opts.bytesPool.Get(len(tags)) checkedEncodedTags.IncRef() checkedEncodedTags.AppendAll(tags) } // No longer being used so we can return to the pool. 
- resources.decodeIndexEntryBytesPool.Put(checksum.ID) - resources.decodeIndexEntryBytesPool.Put(checksum.EncodedTags) + resources.decodeIndexEntryBytesPool.Put(entry.ID) + resources.decodeIndexEntryBytesPool.Put(entry.EncodedTags) - return xio.IndexChecksum{ + return xio.WideEntry{ ID: id, - Size: checksum.Size, - Offset: checksum.Offset, - DataChecksum: checksum.DataChecksum, + Size: entry.Size, + Offset: entry.Offset, + DataChecksum: entry.DataChecksum, EncodedTags: checkedEncodedTags, - MetadataChecksum: checksum.MetadataChecksum, + MetadataChecksum: entry.MetadataChecksum, }, nil } } diff --git a/src/dbnode/persist/fs/types.go b/src/dbnode/persist/fs/types.go index 55a3e70874..4db0c0c477 100644 --- a/src/dbnode/persist/fs/types.go +++ b/src/dbnode/persist/fs/types.go @@ -29,7 +29,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" @@ -51,7 +50,7 @@ import ( xtime "github.com/m3db/m3/src/x/time" ) -// FileSetFileIdentifier contains all the information required to identify a FileSetFile +// FileSetFileIdentifier contains all the information required to identify a FileSetFile. type FileSetFileIdentifier struct { FileSetContentType persist.FileSetContentType Namespace ident.ID @@ -62,7 +61,7 @@ type FileSetFileIdentifier struct { VolumeIndex int } -// DataWriterOpenOptions is the options struct for the Open method on the DataFileSetWriter +// DataWriterOpenOptions is the options struct for the Open method on the DataFileSetWriter. type DataWriterOpenOptions struct { FileSetType persist.FileSetType FileSetContentType persist.FileSetContentType @@ -73,13 +72,13 @@ type DataWriterOpenOptions struct { } // DataWriterSnapshotOptions is the options struct for Open method on the DataFileSetWriter -// that contains information specific to writing snapshot files +// that contains information specific to writing snapshot files. type DataWriterSnapshotOptions struct { SnapshotTime time.Time SnapshotID []byte } -// DataFileSetWriter provides an unsynchronized writer for a TSDB file set +// DataFileSetWriter provides an unsynchronized writer for a TSDB file set. type DataFileSetWriter interface { io.Closer @@ -111,7 +110,7 @@ type SnapshotMetadataFileReader interface { Read(id SnapshotMetadataIdentifier) (SnapshotMetadata, error) } -// DataFileSetReaderStatus describes the status of a file set reader +// DataFileSetReaderStatus describes the status of a file set reader. type DataFileSetReaderStatus struct { Namespace ident.ID BlockStart time.Time @@ -135,23 +134,7 @@ type DataReaderOpenOptions struct { OptimizedReadMetadataOnly bool } -// StreamedChecksum yields a schema.IndexChecksum value asynchronously, -// and any errors encountered during execution. -type StreamedChecksum interface { - // RetrieveIndexChecksum retrieves the index checksum. - RetrieveIndexChecksum() (xio.IndexChecksum, error) -} - -type emptyStreamedChecksum struct{} - -func (emptyStreamedChecksum) RetrieveIndexChecksum() (xio.IndexChecksum, error) { - return xio.IndexChecksum{}, nil -} - -// EmptyStreamedChecksum is an empty streamed checksum. 
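The status handling above amounts to a short-circuiting linear scan over entries sorted by ID: a mismatch means keep scanning, while sorting past the target proves the ID is absent. A rough standalone sketch of that scan, using a hypothetical in-memory entry type instead of the decoder's streaming API:

import (
	"bytes"
	"errors"
)

var errScanNotFound = errors.New("id not found")

type scannedEntry struct {
	ID []byte
}

// findEntry scans entries sorted ascending by ID, short-circuiting as
// soon as the scan has sorted past where the target ID would appear.
func findEntry(entries []scannedEntry, id []byte) (scannedEntry, error) {
	for _, e := range entries {
		switch cmp := bytes.Compare(e.ID, id); {
		case cmp == 0:
			// Matched.
			return e, nil
		case cmp > 0:
			// Sorted past the target: it cannot appear later in the file.
			return scannedEntry{}, errScanNotFound
		}
		// cmp < 0: a mismatch, but the target may still appear; keep going.
	}
	return scannedEntry{}, errScanNotFound
}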
-var EmptyStreamedChecksum StreamedChecksum = emptyStreamedChecksum{} - -// DataFileSetReader provides an unsynchronized reader for a TSDB file set +// DataFileSetReader provides an unsynchronized reader for a TSDB file set. type DataFileSetReader interface { io.Closer @@ -167,8 +150,8 @@ type DataFileSetReader interface { // them so they can be returned to their respective pools. Read() (id ident.ID, tags ident.TagIterator, data checked.Bytes, checksum uint32, err error) - // StreamingRead returns the next unpooled id, encodedTags, data, checksum values ordered by id, - // or error, will return io.EOF at end of volume. + // StreamingRead returns the next unpooled id, encodedTags, data, checksum + // values ordered by id, or error, will return io.EOF at end of volume. // Can only by used when DataReaderOpenOptions.StreamingEnabled is enabled. // Note: the returned id, encodedTags and data get invalidated on the next call to StreamingRead. StreamingRead() (id ident.BytesID, encodedTags ts.EncodedTags, data []byte, checksum uint32, err error) @@ -183,36 +166,36 @@ type DataFileSetReader interface { // for concurrent use and has a Close() method for releasing resources when done. ReadBloomFilter() (*ManagedConcurrentBloomFilter, error) - // Validate validates both the metadata and data and returns an error if either is corrupted + // Validate validates both the metadata and data and returns an error if either is corrupted. Validate() error - // ValidateMetadata validates the data and returns an error if the data is corrupted + // ValidateMetadata validates the data and returns an error if the data is corrupted. ValidateMetadata() error - // ValidateData validates the data and returns an error if the data is corrupted + // ValidateData validates the data and returns an error if the data is corrupted. ValidateData() error - // Range returns the time range associated with data in the volume + // Range returns the time range associated with data in the volume. Range() xtime.Range - // Entries returns the count of entries in the volume + // Entries returns the count of entries in the volume. Entries() int - // EntriesRead returns the position read into the volume + // EntriesRead returns the position read into the volume. EntriesRead() int - // MetadataRead returns the position of metadata read into the volume + // MetadataRead returns the position of metadata read into the volume. MetadataRead() int - // StreamingEnabled returns true if the reader is opened in streaming mode + // StreamingEnabled returns true if the reader is opened in streaming mode. StreamingEnabled() bool } -// DataFileSetSeeker provides an out of order reader for a TSDB file set +// DataFileSetSeeker provides an out of order reader for a TSDB file set. type DataFileSetSeeker interface { io.Closer - // Open opens the files for the given shard and version for reading + // Open opens the files for the given shard and version for reading. Open( namespace ident.ID, shard uint32, @@ -230,23 +213,15 @@ type DataFileSetSeeker interface { // entry and don't want to waste resources looking it up again. SeekByIndexEntry(entry IndexEntry, resources ReusableSeekerResources) (checked.Bytes, error) - // SeekReadMismatchesByIndexChecksum seeks in a manner similar to - // SeekIndexByEntry, checking against a set of streamed index checksums. 
- SeekReadMismatchesByIndexChecksum( - checksum xio.IndexChecksum, - mismatchChecker wide.EntryChecksumMismatchChecker, - resources ReusableSeekerResources, - ) (wide.ReadMismatch, error) - // SeekIndexEntry returns the IndexEntry for the specified ID. This can be useful // ahead of issuing a number of seek requests so that the seek requests can be // made in order. The returned IndexEntry can also be passed to SeekUsingIndexEntry // to prevent duplicate index lookups. SeekIndexEntry(id ident.ID, resources ReusableSeekerResources) (IndexEntry, error) - // SeekIndexEntryToIndexChecksum seeks in a manner similar to SeekIndexEntry, but - // instead yields a minimal structure describing a checksum of the series. - SeekIndexEntryToIndexChecksum(id ident.ID, resources ReusableSeekerResources) (xio.IndexChecksum, error) + // SeekWideEntry seeks in a manner similar to SeekIndexEntry, but + // instead yields a wide entry checksum of the series. + SeekWideEntry(id ident.ID, resources ReusableSeekerResources) (xio.WideEntry, error) // Range returns the time range associated with data in the volume Range() xtime.Range @@ -280,18 +255,11 @@ type ConcurrentDataFileSetSeeker interface { // SeekByIndexEntry is the same as in DataFileSetSeeker. SeekByIndexEntry(entry IndexEntry, resources ReusableSeekerResources) (checked.Bytes, error) - // SeekReadMismatchesByIndexChecksum is the same as in DataFileSetSeeker. - SeekReadMismatchesByIndexChecksum( - checksum xio.IndexChecksum, - mismatchChecker wide.EntryChecksumMismatchChecker, - resources ReusableSeekerResources, - ) (wide.ReadMismatch, error) - // SeekIndexEntry is the same as in DataFileSetSeeker. SeekIndexEntry(id ident.ID, resources ReusableSeekerResources) (IndexEntry, error) - // SeekIndexEntryToIndexChecksum is the same as in DataFileSetSeeker. - SeekIndexEntryToIndexChecksum(id ident.ID, resources ReusableSeekerResources) (xio.IndexChecksum, error) + // SeekWideEntry is the same as in DataFileSetSeeker. + SeekWideEntry(id ident.ID, resources ReusableSeekerResources) (xio.WideEntry, error) // ConcurrentIDBloomFilter is the same as in DataFileSetSeeker. ConcurrentIDBloomFilter() *ManagedConcurrentBloomFilter @@ -327,19 +295,19 @@ type DataFileSetSeekerManager interface { Test(id ident.ID, shard uint32, start time.Time) (bool, error) } -// DataBlockRetriever provides a block retriever for TSDB file sets +// DataBlockRetriever provides a block retriever for TSDB file sets. type DataBlockRetriever interface { io.Closer block.DatabaseBlockRetriever - // Open the block retriever to retrieve from a namespace + // Open the block retriever to retrieve from a namespace. Open( md namespace.Metadata, shardSet sharding.ShardSet, ) error } -// RetrievableDataBlockSegmentReader is a retrievable block reader +// RetrievableDataBlockSegmentReader is a retrievable block reader. type RetrievableDataBlockSegmentReader interface { xio.SegmentReader } @@ -464,7 +432,7 @@ type Options interface { IndexSummariesPercent() float64 // SetIndexBloomFilterFalsePositivePercent size sets the percent of false positive - // rate to use for the index bloom filter size and k hashes estimation + // rate to use for the index bloom filter size and k hashes estimation. SetIndexBloomFilterFalsePositivePercent(value float64) Options // IndexBloomFilterFalsePositivePercent size returns the percent of false positive @@ -493,10 +461,12 @@ type Options interface { // WriterBufferSize returns the buffer size for writing TSDB files. 
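With the mismatch-seek methods gone, the seeker surface for wide operations reduces to borrowing a concurrent seeker and calling SeekWideEntry. A rough usage sketch, written as if inside package fs so it can reference the unexported errSeekIDNotFound (Return error handling elided):

func seekWideEntryOnce(
	mgr DataFileSetSeekerManager,
	shard uint32,
	blockStart time.Time,
	id ident.ID,
	resources ReusableSeekerResources,
) (xio.WideEntry, error) {
	seeker, err := mgr.Borrow(shard, blockStart)
	if err != nil {
		return xio.WideEntry{}, err
	}
	defer func() { _ = mgr.Return(shard, blockStart, seeker) }()

	entry, err := seeker.SeekWideEntry(id, resources)
	if errors.Is(err, errSeekIDNotFound) {
		// A missing ID yields an empty entry rather than an error.
		return xio.WideEntry{}, nil
	}
	if err != nil {
		return xio.WideEntry{}, err
	}

	// Caller takes ownership of the entry and must finalize it when done.
	return entry, nil
}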
	WriterBufferSize() int
 
-	// SetInfoReaderBufferSize sets the buffer size for reading TSDB info, digest and checkpoint files.
+	// SetInfoReaderBufferSize sets the buffer size for reading TSDB info,
+	// digest and checkpoint files.
 	SetInfoReaderBufferSize(value int) Options
 
-	// InfoReaderBufferSize returns the buffer size for reading TSDB info, digest and checkpoint files.
+	// InfoReaderBufferSize returns the buffer size for reading TSDB info,
+	// digest and checkpoint files.
 	InfoReaderBufferSize() int
 
 	// SetDataReaderBufferSize sets the buffer size for reading TSDB data and index files.
@@ -568,7 +538,7 @@ type Options interface {
 	EncodingOptions() msgpack.LegacyEncodingOptions
 }
 
-// BlockRetrieverOptions represents the options for block retrieval
+// BlockRetrieverOptions represents the options for block retrieval.
 type BlockRetrieverOptions interface {
 	// Validate validates the options.
 	Validate() error
@@ -653,9 +623,10 @@ type Merger interface {
 		onFlush persist.OnFlushSeries,
 	) (persist.DataCloser, error)
 
-	// MergeAndCleanup merges the specified fileset file with a merge target and removes the previous version of the
-	// fileset. This should only be called within the bootstrapper. Any other file deletions outside of the bootstrapper
-	// should be handled by the CleanupManager.
+	// MergeAndCleanup merges the specified fileset file with a merge target and
+	// removes the previous version of the fileset. This should only be called
+	// within the bootstrapper. Any other file deletions outside of the
+	// bootstrapper should be handled by the CleanupManager.
 	MergeAndCleanup(
 		fileID FileSetFileIdentifier,
 		mergeWith MergeWith,
@@ -695,20 +666,21 @@ type BlockRecord struct {
 	DataChecksum uint32
 }
 
-// CrossBlockReader allows reading data (encoded bytes) from multiple DataFileSetReaders of the same shard,
-// ordered by series id first, and block start time next.
+// CrossBlockReader allows reading data (encoded bytes) from multiple
+// DataFileSetReaders of the same shard, ordered lexicographically by series ID,
+// then by block time.
 type CrossBlockReader interface {
 	io.Closer
 
-	// Next advances to the next data record and returns true, or returns false if no more data exists.
+	// Next advances to the next data record, returning true if it exists.
 	Next() bool
 
 	// Err returns the last error encountered (if any).
 	Err() error
 
-	// Current returns distinct series id and encodedTags, plus a slice with data and checksums from all
-	// blocks corresponding to that series (in temporal order).
-	// id, encodedTags, records slice and underlying data are being invalidated on each call to Next().
+	// Current returns distinct series id and encodedTags, plus a slice with data
+	// and checksums from all blocks corresponding to that series (in temporal order).
+	// ID, encodedTags, records, and underlying data are invalidated on each call to Next().
 	Current() (id ident.BytesID, encodedTags ts.EncodedTags, records []BlockRecord)
 }
diff --git a/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker.go b/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker.go
deleted file mode 100644
index 1b5d04324f..0000000000
--- a/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker.go
+++ /dev/null
@@ -1,280 +0,0 @@
-// Copyright (c) 2020 Uber Technologies, Inc.
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE - -package wide - -import ( - "bytes" - "fmt" - "sync" - - "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - "github.com/m3db/m3/src/dbnode/persist/schema" - "github.com/m3db/m3/src/dbnode/x/xio" - "github.com/m3db/m3/src/x/instrument" - - "go.uber.org/zap" -) - -type entryWithChecksum struct { - idChecksum int64 - entry schema.IndexEntry -} - -type entryChecksumMismatchChecker struct { - mu sync.Mutex - - blockReader IndexChecksumBlockBatchReader - mismatches []ReadMismatch - strictLastID []byte - - decodeOpts msgpack.DecodingOptions - iOpts instrument.Options - - batchIdx int - exhausted bool - started bool -} - -// FIXME: remove once this is changed to single output. -func (c *entryChecksumMismatchChecker) Lock() { - c.mu.Lock() -} - -// FIXME: remove once this is changed to single output. -func (c *entryChecksumMismatchChecker) Unlock() { - c.mu.Unlock() -} - -// NewEntryChecksumMismatchChecker creates a new entry checksum mismatch -// checker, backed by the given block reader. -// NB: index entries MUST be checked in lexicographical order by ID. -func NewEntryChecksumMismatchChecker( - blockReader IndexChecksumBlockBatchReader, - opts Options, -) EntryChecksumMismatchChecker { - return &entryChecksumMismatchChecker{ - blockReader: blockReader, - mismatches: make([]ReadMismatch, 0, opts.BatchSize()), - decodeOpts: opts.DecodingOptions(), - iOpts: opts.InstrumentOptions(), - } -} - -func checksumMismatch(checksum xio.IndexChecksum) ReadMismatch { - return ReadMismatch{ - IndexChecksum: checksum, - } -} - -func (c *entryChecksumMismatchChecker) checksumMismatches( - checksums ...xio.IndexChecksum, -) []ReadMismatch { - for _, checksum := range checksums { - c.mismatches = append(c.mismatches, checksumMismatch(checksum)) - } - - return c.mismatches -} - -func (c *entryChecksumMismatchChecker) recordIndexMismatches(checksums ...int64) { - for _, checksum := range checksums { - c.mismatches = append(c.mismatches, ReadMismatch{ - IndexChecksum: xio.IndexChecksum{ - MetadataChecksum: checksum, - }, - }) - } -} - -func (c *entryChecksumMismatchChecker) emitInvariantViolation( - marker []byte, - checksum int64, - entry xio.IndexChecksum, -) error { - // Checksums match but IDs do not. Treat as an invariant violation. 
- err := fmt.Errorf("checksum collision") - instrument.EmitAndLogInvariantViolation(c.iOpts, func(l *zap.Logger) { - l.Error( - err.Error(), - zap.Int64("checksum", checksum), - zap.Binary("marker", marker), - zap.Any("entry", entry), - ) - }) - return err -} - -func (c *entryChecksumMismatchChecker) readNextBatch() IndexChecksumBlockBatch { - if !c.blockReader.Next() { - c.exhausted = true - // NB: set exhausted to true and return an empty since there are no - // more available checksum blocks. - return IndexChecksumBlockBatch{} - } - - c.batchIdx = 0 - batch := c.blockReader.Current() - return batch -} - -func (c *entryChecksumMismatchChecker) ComputeMismatchesForEntry( - entry xio.IndexChecksum, -) ([]ReadMismatch, error) { - c.mismatches = c.mismatches[:0] - if c.exhausted { - // NB: no remaining batches in the index checksum block; any further - // elements are mismatches (missing from primary). - return c.checksumMismatches(entry), nil - } - - if !c.started { - c.started = true - if !c.blockReader.Next() { - // NB: no index checksum blocks available; any further - // elements are mismatches (missing from primary). - c.exhausted = true - return c.checksumMismatches(entry), nil - } - - c.batchIdx = 0 - } - - batch := c.blockReader.Current() - for { - markerIdx := len(batch.Checksums) - 1 - - // NB: If the incoming checksum block is empty, move to the next one. - if len(batch.Checksums) == 0 { - batch = c.readNextBatch() - if c.exhausted { - return c.mismatches, nil - } - - continue - } - - checksum := batch.Checksums[c.batchIdx] - markerCompare := bytes.Compare(batch.EndMarker, entry.ID.Bytes()) - if c.batchIdx < markerIdx { - if checksum == entry.MetadataChecksum { - // Matches: increment batch index and return any gathered mismatches. - c.batchIdx++ - return c.mismatches, nil - } - - for nextBatchIdx := c.batchIdx + 1; nextBatchIdx < markerIdx; nextBatchIdx++ { - // NB: read next hashes, checking for index checksum matches. - nextChecksum := batch.Checksums[nextBatchIdx] - if entry.MetadataChecksum != nextChecksum { - continue - } - - // Checksum match. Add previous checksums as mismatches. - c.recordIndexMismatches(batch.Checksums[c.batchIdx:nextBatchIdx]...) - c.batchIdx = nextBatchIdx + 1 - return c.mismatches, nil - } - - checksum = batch.Checksums[markerIdx] - // NB: this is the last element in the batch. Check ID against MARKER. - if entry.MetadataChecksum == checksum { - if markerCompare != 0 { - // Checksums match but IDs do not. Treat as emitInvariantViolation violation. - return nil, c.emitInvariantViolation(batch.EndMarker, checksum, entry) - } - - c.recordIndexMismatches(batch.Checksums[c.batchIdx:markerIdx]...) - // ID and checksum match. Advance the block iter and return empty. - batch = c.readNextBatch() - return c.mismatches, nil - } - - // Checksums do not match. - if markerCompare > 0 { - // This is a mismatch on primary that appears before the - // marker element. Return mismatch but do not advance iter. - return c.checksumMismatches(entry), nil - } - - // Current value is past the end of this batch. Mark all in batch as - // mismatches, and receive next batch. - c.recordIndexMismatches(batch.Checksums[c.batchIdx:]...) - batch = c.readNextBatch() - if c.exhausted { - // If no further values, add the current entry as a mismatch and return. - return c.checksumMismatches(entry), nil - } - - // All mismatches marked for the current batch, check entry against next - // batch. - continue - } - - // NB: this is the last element in the batch. Check ID against MARKER. 
- if entry.MetadataChecksum == checksum { - if markerCompare != 0 { - // Checksums match but IDs do not. Treat as emitInvariantViolation violation. - return nil, c.emitInvariantViolation(batch.EndMarker, checksum, entry) - } - - // ID and checksum match. Advance the block iter and return gathered mismatches. - batch = c.readNextBatch() - return c.mismatches, nil - } - - // Checksum mismatch. - if markerCompare == 0 { - // IDs match but checksums do not. Advance the block iter and return - // mismatch. - batch = c.readNextBatch() - return c.checksumMismatches(entry), nil - } else if markerCompare > 0 { - // This is a mismatch on primary that appears before the - // marker element. Return mismatch but do not advance iter. - return c.checksumMismatches(entry), nil - } - - // The current batch here is exceeded. Emit the current batch marker as - // a mismatch on primary, and advance the block iter. - c.recordIndexMismatches(checksum) - batch = c.readNextBatch() - if c.exhausted { - // If no further values, add the current entry as a mismatch and return. - return c.checksumMismatches(entry), nil - } - } -} - -func (c *entryChecksumMismatchChecker) Drain() []ReadMismatch { - if c.exhausted { - return nil - } - - c.mismatches = c.mismatches[:0] - curr := c.blockReader.Current() - c.recordIndexMismatches(curr.Checksums[c.batchIdx:]...) - for c.blockReader.Next() { - curr := c.blockReader.Current() - c.recordIndexMismatches(curr.Checksums...) - } - - return c.mismatches -} diff --git a/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_prop_test.go b/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_prop_test.go deleted file mode 100644 index ef213871ce..0000000000 --- a/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_prop_test.go +++ /dev/null @@ -1,463 +0,0 @@ -// +build big -// -// Copyright (c) 2020 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -package wide - -import ( - "fmt" - "os" - "sort" - "testing" - "time" - - "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - "github.com/m3db/m3/src/dbnode/persist/schema" - "github.com/m3db/m3/src/dbnode/x/xio" - "github.com/m3db/m3/src/x/checked" - "github.com/m3db/m3/src/x/ident" - xhash "github.com/m3db/m3/src/x/test/hash" - - "github.com/leanovate/gopter" - "github.com/leanovate/gopter/gen" - "github.com/leanovate/gopter/prop" -) - -func generateRawChecksums(size int, opts Options) []xio.IndexChecksum { - checksums := make([]xio.IndexChecksum, size) - indexHasher := opts.DecodingOptions().IndexEntryHasher() - for i := range checksums { - idStr := fmt.Sprintf("id-%03d", i) - tags := []byte(fmt.Sprintf("tags-%03d", i)) - - entry := schema.IndexEntry{ - ID: []byte(idStr), - EncodedTags: tags, - } - - checksums[i] = xio.IndexChecksum{ - ID: ident.StringID(idStr), - EncodedTags: checked.NewBytes(tags, checked.NewBytesOptions()), - MetadataChecksum: indexHasher.HashIndexEntry(entry), - } - } - - return checksums -} - -type generatedEntries struct { - taking []bool - entries []xio.IndexChecksum -} - -// genEntryTestInput creates a list of indexChecksums, -// dropping a certain percentage. -func genEntryTestInput(size int, opts Options) gopter.Gen { - entries := generateRawChecksums(size, opts) - - return gopter.CombineGens( - // NB: This generator controls if the element should be removed - gen.SliceOfN(len(entries), gen.IntRange(0, 100)), - ).Map(func(val []interface{}) generatedEntries { - var ( - dropChancePercent = val[0].([]int) - - taking []bool - takenEntries []xio.IndexChecksum - ) - - for i, chance := range dropChancePercent { - shouldKeep := chance <= 80 - taking = append(taking, shouldKeep) - if shouldKeep { - takenEntries = append(takenEntries, entries[i]) - } - } - - return generatedEntries{taking: taking, entries: takenEntries} - }) -} - -type generatedChecksums struct { - taking []bool - blockBatch []IndexChecksumBlockBatch -} - -// genChecksumTestInput creates index checksum blockBatch of randomized sizes, -// dropping a certain percentage of index checksums. 
-func genChecksumTestInput(size int, opts Options) gopter.Gen { - entries := generateRawChecksums(size, opts) - - return gopter.CombineGens( - // NB: This generator controls if the element should be removed - gen.SliceOfN(len(entries), gen.IntRange(0, 100)), - // NB: This generator controls how large each batch will be - gen.SliceOfN(len(entries), gen.IntRange(1, len(entries))), - ).Map(func(val []interface{}) generatedChecksums { - var ( - dropChancePercent = val[0].([]int) - blockSizes = val[1].([]int) - - taking []bool - takenChecksums []xio.IndexChecksum - checksumBlocks []IndexChecksumBlockBatch - ) - - for i, chance := range dropChancePercent { - shouldKeep := chance <= 80 - taking = append(taking, shouldKeep) - if shouldKeep { - takenChecksums = append(takenChecksums, xio.IndexChecksum{ - ID: entries[i].ID, - MetadataChecksum: entries[i].MetadataChecksum, - }) - } - } - - for _, blockSize := range blockSizes { - remaining := len(takenChecksums) - if remaining == 0 { - break - } - - take := blockSize - if remaining < take { - take = remaining - } - - block := IndexChecksumBlockBatch{ - Checksums: make([]int64, 0, take), - } - - for i := 0; i < take; i++ { - block.Checksums = append(block.Checksums, takenChecksums[i].MetadataChecksum) - block.EndMarker = takenChecksums[i].ID.Bytes() - } - - takenChecksums = takenChecksums[take:] - checksumBlocks = append(checksumBlocks, block) - } - - return generatedChecksums{taking: taking, blockBatch: checksumBlocks} - }) -} - -type mismatchChecksumBatch struct { - lastElementMarker bool - mismatches []mismatchChecksum -} - -func (b *mismatchChecksumBatch) gatherContiguousMismatchValues() { - var ( - checksumSet bool - hasEntryMismatch bool - hasChecksumMismatch bool - contiguousCount int - nextContiguous int64 - ) - - for idx, mismatchChecksum := range b.mismatches { - var ( - lastIsContiguous bool - - checksum = mismatchChecksum.checksum - isLast = idx == len(b.mismatches)-1 - ) - - // NB: gather the number of contiguous mismatches. Mismatches are contiguous - // if they appear one after another, with no matching entries between them. - if !checksumSet || checksum == nextContiguous { - checksumSet = true - - if mismatchChecksum.entryMismatch { - hasEntryMismatch = true - } else { - hasChecksumMismatch = true - } - - contiguousCount++ - if !isLast { - // If this is not the last mismatch, increase the contiguous length. - nextContiguous = checksum + 1 - continue - } else { - lastIsContiguous = true - } - } - - // A contiguous set of mismatches should be sorted IFF: - // - at least 2 values - // - contiguous set contains both entry and checksum mismatches - // After sorting, all entry mismatches should appear first, in - // increasing order, followed by index mismatches in increasing order. - // NB: if the last element of a batch is a mismatch, it is fixed and should - // not be sorted. - if contiguousCount > 1 && hasEntryMismatch && hasChecksumMismatch { - firstContiguous := idx - contiguousCount - lastContiguous := idx - if lastIsContiguous { - firstContiguous++ - if !b.lastElementMarker { - lastContiguous++ - } - } - - contiguousSlice := b.mismatches[firstContiguous:lastContiguous] - sort.Slice(contiguousSlice, func(i, j int) bool { - iEntry, jEntry := contiguousSlice[i], contiguousSlice[j] - if iEntry.entryMismatch { - if !jEntry.entryMismatch { - // entry mismatches always come before checksum mismatches. 
-						return true
-					}
-
-					// these should be sorted in ascending checksum order
-					return iEntry.checksum < jEntry.checksum
-				}
-
-				if jEntry.entryMismatch {
-					// checksum mismatches always come after entry mismatches.
-					return false
-				}
-
-				// these should be sorted in ascending checksum order
-				return iEntry.checksum < jEntry.checksum
-			})
-		}
-
-		// clear
-		contiguousCount = 1
-		hasChecksumMismatch = false
-		hasEntryMismatch = false
-		if mismatchChecksum.entryMismatch {
-			hasEntryMismatch = true
-		} else {
-			hasChecksumMismatch = true
-		}
-
-		nextContiguous = checksum + 1
-	}
-}
-
-func allMismatchChecksumsToMismatchesByBatch(
-	checksums generatedChecksums,
-	allMismatchChecksums []mismatchChecksum,
-) []mismatchChecksumBatch {
-	allMismatchIdx := 0
-	var mismatchBatch []mismatchChecksumBatch
-	for _, batch := range checksums.blockBatch {
-		l := len(batch.Checksums)
-		if l == 0 {
-			continue
-		}
-
-		lastChecksum := batch.Checksums[l-1]
-		lastElementMarker := false
-		var mismatches []mismatchChecksum
-		for _, mismatch := range allMismatchChecksums[allMismatchIdx:] {
-			if mismatch.checksum > lastChecksum {
-				// mismatch past last checksum in batch; append current batch and
-				// start a new one.
-				break
-			}
-
-			mismatches = append(mismatches, mismatch)
-			allMismatchIdx++
-			if mismatch.checksum == lastChecksum {
-				// mismatch is last checksum in batch; append current batch and
-				// start a new one.
-				lastElementMarker = true
-				break
-			}
-		}
-
-		if len(mismatches) == 0 {
-			continue
-		}
-
-		// add a mismatch batch; important to note if the last element is a mismatch,
-		// since if it is, it should always remain the last element, regardless of
-		// whether it forms a contiguous group or not.
-		mismatchBatch = append(mismatchBatch, mismatchChecksumBatch{
-			lastElementMarker: lastElementMarker,
-			mismatches:        mismatches,
-		})
-	}
-
-	// add any remaining mismatch checksums as a separate batch. This is ok
-	// since they will all be entry mismatches, so no additional sorting will be
-	// performed on this batch.
-	if allMismatchIdx < len(allMismatchChecksums) {
-		mismatchBatch = append(mismatchBatch, mismatchChecksumBatch{
-			lastElementMarker: false,
-			mismatches:        allMismatchChecksums[allMismatchIdx:],
-		})
-	}
-
-	return mismatchBatch
-}
-
-type mismatchChecksum struct {
-	missingOnBoth bool
-	checksum      int64
-	entryMismatch bool
-}
-
-func buildExpectedMismatchChecksums(
-	checksums generatedChecksums,
-	takeEntries []bool,
-) []mismatchChecksum {
-	var allMismatchChecksums []mismatchChecksum
-	takeChecksums := checksums.taking
-	// Collect only elements that don't match.
-	for idx, takeEntry := range takeEntries {
-		if takeEntry != takeChecksums[idx] {
-			allMismatchChecksums = append(allMismatchChecksums, mismatchChecksum{
-				checksum:      int64(idx),
-				entryMismatch: takeEntry,
-			})
-		} else if !takeEntry && !takeChecksums[idx] {
-			// Note checksums missing from both sets; this will be necessary when
-			// checking for contiguous series in gatherContiguousMismatchValues.
-			allMismatchChecksums = append(allMismatchChecksums, mismatchChecksum{
-				missingOnBoth: true,
-				checksum:      int64(idx),
-			})
-		}
-	}
-
-	var gatheredMismatchChecksums []mismatchChecksum
-	// Gather mismatches to match incoming batches.
-	mismatchesByBatch := allMismatchChecksumsToMismatchesByBatch(checksums, allMismatchChecksums)
-	for _, batchMismatches := range mismatchesByBatch {
-		// Sort each batch as will be expected in output.
-		batchMismatches.gatherContiguousMismatchValues()
-
-		// Filter out series which do not appear in either checksum source.
- filteredMismatches := batchMismatches.mismatches[:0] - for _, mismatch := range batchMismatches.mismatches { - if !mismatch.missingOnBoth { - filteredMismatches = append(filteredMismatches, mismatch) - } - } - - gatheredMismatchChecksums = append(gatheredMismatchChecksums, filteredMismatches...) - } - - return gatheredMismatchChecksums -} - -func TestIndexEntryWideBatchMismatchChecker(t *testing.T) { - var ( - parameters = gopter.DefaultTestParameters() - seed = time.Now().UnixNano() - props = gopter.NewProperties(parameters) - reporter = gopter.NewFormatedReporter(true, 80, os.Stdout) - - hasher = xhash.NewParsedIndexHasher(t) - decodingOpts = msgpack.NewDecodingOptions().SetIndexEntryHasher(hasher) - opts = NewOptions().SetDecodingOptions(decodingOpts) - - size = 100 - numTests = 1000 - ) - - parameters.MinSuccessfulTests = numTests - parameters.Rng.Seed(seed) - - // NB: capture seed to be able to replicate failed runs. - fmt.Println("Running test with seed", seed) - props.Property("Checksum mismatcher detects correctly", - prop.ForAll( - func( - genChecksums generatedChecksums, - genEntries generatedEntries, - ) (bool, error) { - inputBlockCh := make(chan IndexChecksumBlockBatch) - inputBlockReader := NewIndexChecksumBlockBatchReader(inputBlockCh) - - go func() { - for _, bl := range genChecksums.blockBatch { - inputBlockCh <- bl - } - - close(inputBlockCh) - }() - - checker := NewEntryChecksumMismatchChecker(inputBlockReader, opts) - var readMismatches []ReadMismatch - for _, entry := range genEntries.entries { - entryMismatches, err := checker.ComputeMismatchesForEntry(entry) - if err != nil { - return false, fmt.Errorf("failed to compute index entry: %v", err) - } - - readMismatches = append(readMismatches, entryMismatches...) - } - - readMismatches = append(readMismatches, checker.Drain()...) 
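			// Property under test: the mismatches the checker reported above
			// must equal the mismatches derived independently below from
			// which elements each side dropped during generation.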
-				expectedMismatches := buildExpectedMismatchChecksums(
-					genChecksums, genEntries.taking)
-
-				if len(expectedMismatches) != len(readMismatches) {
-					return false, fmt.Errorf("expected %d mismatches, got %d",
-						len(expectedMismatches), len(readMismatches))
-				}
-
-				for i, expected := range expectedMismatches {
-					actual := readMismatches[i]
-					if actual.MetadataChecksum != expected.checksum {
-						return false, fmt.Errorf("expected checksum %d, got %d at %d",
-							expected.checksum, actual.MetadataChecksum, i)
-					}
-
-					if expected.entryMismatch {
-						expectedTags := fmt.Sprintf("tags-%03d", actual.MetadataChecksum)
-						actual.EncodedTags.IncRef()
-						acTags := string(actual.EncodedTags.Bytes())
-						actual.EncodedTags.DecRef()
-						if acTags != expectedTags {
-							return false, fmt.Errorf("expected tags %s, got %s",
-								expectedTags, acTags)
-						}
-
-						expectedID := fmt.Sprintf("id-%03d", actual.MetadataChecksum)
-						if acID := actual.ID.String(); acID != expectedID {
-							return false, fmt.Errorf("expected id %s, got %s",
-								expectedID, acID)
-						}
-					} else {
-						if actual.EncodedTags != nil {
-							return false, fmt.Errorf("index mismatch should not have tags")
-						}
-						if actual.ID != nil {
-							return false, fmt.Errorf("index mismatch should not have id")
-						}
-					}
-				}
-
-				return true, nil
-			}, genChecksumTestInput(size, opts), genEntryTestInput(size, opts)))
-
-	if !props.Run(reporter) {
-		t.Errorf("failed with initial seed: %d", seed)
-	}
-}
diff --git a/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_test.go b/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_test.go
deleted file mode 100644
index 18a41aadc9..0000000000
--- a/src/dbnode/persist/fs/wide/entry_checksum_mismatch_checker_test.go
+++ /dev/null
@@ -1,375 +0,0 @@
-// Copyright (c) 2020 Uber Technologies, Inc.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-package wide
-
-import (
-	"testing"
-	"time"
-
-	"github.com/m3db/m3/src/dbnode/persist/fs/msgpack"
-	"github.com/m3db/m3/src/dbnode/x/xio"
-	"github.com/m3db/m3/src/x/checked"
-	"github.com/m3db/m3/src/x/ident"
-	"github.com/m3db/m3/src/x/instrument"
-	xtest "github.com/m3db/m3/src/x/test"
-	xhash "github.com/m3db/m3/src/x/test/hash"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-func buildTestReader(bls ...IndexChecksumBlockBatch) IndexChecksumBlockBatchReader {
-	ch := make(chan IndexChecksumBlockBatch)
-	reader := NewIndexChecksumBlockBatchReader(ch)
-	go func() {
-		for _, bl := range bls {
-			ch <- bl
-		}
-
-		close(ch)
-	}()
-	return reader
-}
-
-// buildOpts builds default test options. NewParsedIndexHasher sets the hash
-// value for a xio.IndexEntry to the integer value parsed from any
-// string-represented integers in the entry ID + entry tags.
-func buildOpts(t *testing.T) Options {
-	decodingOpts := msgpack.NewDecodingOptions().
-		SetIndexEntryHasher(xhash.NewParsedIndexHasher(t))
-	opts := NewOptions().
-		SetBatchSize(2).
-		SetDecodingOptions(decodingOpts).
-		SetInstrumentOptions(instrument.NewOptions())
-	require.NoError(t, opts.Validate())
-	return opts
-}
-
-func toChecksum(id, tags string, checksum int64) xio.IndexChecksum {
-	return xio.IndexChecksum{
-		ID:               ident.StringID(id),
-		EncodedTags:      checked.NewBytes([]byte(tags), checked.NewBytesOptions()),
-		MetadataChecksum: checksum,
-	}
-}
-
-func testIdxMismatch(checksum int64) ReadMismatch {
-	return ReadMismatch{
-		IndexChecksum: xio.IndexChecksum{
-			MetadataChecksum: checksum,
-		},
-	}
-}
-
-func testEntryMismatch(id, tags string, checksum int64) ReadMismatch {
-	return ReadMismatch{
-		IndexChecksum: toChecksum(id, tags, checksum),
-	}
-}
-
-func testMismatches(t *testing.T, expected, actual []ReadMismatch) {
-	require.Equal(t, len(expected), len(actual))
-	for i, ex := range expected {
-		assert.Equal(t, ex.ID, actual[i].ID)
-		assert.Equal(t, ex.Size, actual[i].Size)
-		assert.Equal(t, ex.Offset, actual[i].Offset)
-		assert.Equal(t, ex.DataChecksum, actual[i].DataChecksum)
-		assert.Equal(t, ex.MetadataChecksum, actual[i].MetadataChecksum)
-	}
-}
-
-func TestEmitMismatches(t *testing.T) {
-	ctrl := xtest.NewController(t)
-	defer ctrl.Finish()
-	reader := buildTestReader()
-	chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t))
-	checker, ok := chk.(*entryChecksumMismatchChecker)
-	require.True(t, ok)
-
-	id1, tags1 := "foo1", "encoded-tags1"
-	id2, tags2 := "foo2", "encoded-tags2"
-	id3, tags3 := "foo3", "encoded-tags3"
-	checker.checksumMismatches(toChecksum(id1, tags1, 0), toChecksum(id2, tags2, 1))
-	checker.recordIndexMismatches(100, 200)
-	checker.checksumMismatches(toChecksum(id3, tags3, 2))
-	checker.recordIndexMismatches(300)
-
-	expected := []ReadMismatch{
-		testEntryMismatch(id1, tags1, 0),
-		testEntryMismatch(id2, tags2, 1),
-		testIdxMismatch(100),
-		testIdxMismatch(200),
-		testEntryMismatch(id3, tags3, 2),
-		testIdxMismatch(300),
-	}
-
-	testMismatches(t, expected, checker.mismatches)
-}
-
-func TestComputeMismatchInvariant(t *testing.T) {
-	reader := buildTestReader(IndexChecksumBlockBatch{
-		Checksums: []int64{1},
-		EndMarker: []byte("foo1"),
-	})
-
-	chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t))
-	_, err := chk.ComputeMismatchesForEntry(toChecksum("bar1", "bar", 1))
-	require.Error(t, err)
-}
-
-func TestComputeMismatchInvariantEndOfBlock(t *testing.T) {
-	reader := buildTestReader(IndexChecksumBlockBatch{
-		Checksums: []int64{1, 2},
-		EndMarker: []byte("foo2"),
-
}) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - _, err := chk.ComputeMismatchesForEntry(toChecksum("bar2", "bar", 2)) - require.Error(t, err) -} - -func assertNoMismatch( - t *testing.T, - chk EntryChecksumMismatchChecker, - checksum xio.IndexChecksum, -) { - mismatch, err := chk.ComputeMismatchesForEntry(checksum) - require.NoError(t, err) - assert.Equal(t, 0, len(mismatch)) -} - -func TestComputeMismatchWithDelayedReader(t *testing.T) { - ch := make(chan IndexChecksumBlockBatch) - reader := NewIndexChecksumBlockBatchReader(ch) - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - - go func() { - time.Sleep(time.Millisecond * 100) - ch <- IndexChecksumBlockBatch{ - Checksums: []int64{1}, - EndMarker: []byte("foo1"), - } - time.Sleep(time.Millisecond * 200) - ch <- IndexChecksumBlockBatch{ - Checksums: []int64{10}, - EndMarker: []byte("qux10"), - } - close(ch) - }() - - assertNoMismatch(t, chk, toChecksum("foo1", "bar", 1)) - assertNoMismatch(t, chk, toChecksum("qux10", "baz", 10)) - assert.Equal(t, 0, len(chk.Drain())) -} - -func TestComputeMismatchNoMismatch(t *testing.T) { - reader := buildTestReader(IndexChecksumBlockBatch{ - Checksums: []int64{1, 2, 3}, - EndMarker: []byte("foo3"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{100, 5}, - EndMarker: []byte("zoo5"), - }) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - assertNoMismatch(t, chk, toChecksum("abc1", "aaa", 1)) - assertNoMismatch(t, chk, toChecksum("def2", "bbb", 2)) - assertNoMismatch(t, chk, toChecksum("foo3", "ccc", 3)) - assertNoMismatch(t, chk, toChecksum("qux100", "ddd", 100)) - assertNoMismatch(t, chk, toChecksum("zoo5", "eee", 5)) - assert.Equal(t, 0, len(chk.Drain())) -} - -func TestComputeMismatchMismatchesIndexMismatch(t *testing.T) { - reader := buildTestReader(IndexChecksumBlockBatch{ - Checksums: []int64{1, 2, 3}, - EndMarker: []byte("foo3"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{4, 5}, - EndMarker: []byte("moo5"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{6, 7, 8}, - EndMarker: []byte("qux8"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{9, 10}, - EndMarker: []byte("zzz9"), - }) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - - expected := []ReadMismatch{ - testIdxMismatch(1), - testIdxMismatch(2), - } - - mismatches, err := chk.ComputeMismatchesForEntry(toChecksum("foo3", "ccc", 3)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testIdxMismatch(4), - testIdxMismatch(5), - testIdxMismatch(6), - } - - mismatches, err = chk.ComputeMismatchesForEntry(toChecksum("qux7", "ddd", 7)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testIdxMismatch(8), - testIdxMismatch(9), - testIdxMismatch(10), - } - testMismatches(t, expected, chk.Drain()) -} - -func TestComputeMismatchMismatchesEntryMismatches(t *testing.T) { - reader := buildTestReader(IndexChecksumBlockBatch{ - Checksums: []int64{4}, - EndMarker: []byte("foo3"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{5}, - EndMarker: []byte("goo5"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{6}, - EndMarker: []byte("moo6"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{7}, - EndMarker: []byte("qux7"), - }) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - expected := []ReadMismatch{ - testEntryMismatch("abc1", "ccc", 1), - } - - mismatches, err := chk.ComputeMismatchesForEntry(toChecksum("abc1", "ccc", 1)) - require.NoError(t, err) 
- testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testEntryMismatch("def2", "ddd", 2), - } - - mismatches, err = chk.ComputeMismatchesForEntry(toChecksum("def2", "ddd", 2)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testEntryMismatch("foo3", "f1", 3), - } - - mismatches, err = chk.ComputeMismatchesForEntry(toChecksum("foo3", "f1", 3)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testIdxMismatch(5), - } - - mismatches, err = chk.ComputeMismatchesForEntry(toChecksum("moo6", "a", 6)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testIdxMismatch(7), - testEntryMismatch("zoo10", "z", 10), - } - - mismatches, err = chk.ComputeMismatchesForEntry(toChecksum("zoo10", "z", 10)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - assert.Equal(t, 0, len(chk.Drain())) -} - -func TestComputeMismatchMismatchesOvershoot(t *testing.T) { - reader := buildTestReader(IndexChecksumBlockBatch{ - Checksums: []int64{1, 2, 3}, - EndMarker: []byte("foo3"), - }, IndexChecksumBlockBatch{ - Checksums: []int64{4, 5, 10}, - EndMarker: []byte("goo10"), - }) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - expected := []ReadMismatch{ - testEntryMismatch("abc10", "ccc", 10), - } - - mismatches, err := chk.ComputeMismatchesForEntry(toChecksum("abc10", "ccc", 10)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - expected = []ReadMismatch{ - testIdxMismatch(1), - testIdxMismatch(2), - testIdxMismatch(3), - testIdxMismatch(4), - testIdxMismatch(5), - testIdxMismatch(10), - testEntryMismatch("zzz20", "ccc", 20), - } - - mismatches, err = chk.ComputeMismatchesForEntry(toChecksum("zzz20", "ccc", 20)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - assert.Equal(t, 0, len(chk.Drain())) -} - -func TestComputeMismatchMismatchesEntryMismatchSkipsFirst(t *testing.T) { - reader := buildTestReader(IndexChecksumBlockBatch{ - Checksums: []int64{4}, - EndMarker: []byte("foo3"), - }) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - expected := []ReadMismatch{ - testEntryMismatch("foo3", "abc", 3), - } - - mismatches, err := chk.ComputeMismatchesForEntry(toChecksum("foo3", "abc", 3)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - assert.Equal(t, 0, len(chk.Drain())) -} - -func TestComputeMismatchMismatchesEntryMismatchMatchesLast(t *testing.T) { - reader := buildTestReader(IndexChecksumBlockBatch{ - Checksums: []int64{1, 2, 3}, - EndMarker: []byte("foo3"), - }) - - chk := NewEntryChecksumMismatchChecker(reader, buildOpts(t)) - expected := []ReadMismatch{ - testIdxMismatch(1), - testIdxMismatch(2), - } - - mismatches, err := chk.ComputeMismatchesForEntry(toChecksum("foo3", "abc", 3)) - require.NoError(t, err) - testMismatches(t, expected, mismatches) - - assert.Equal(t, 0, len(chk.Drain())) -} diff --git a/src/dbnode/persist/fs/wide/index_checksum_block_batch_reader.go b/src/dbnode/persist/fs/wide/index_checksum_block_batch_reader.go deleted file mode 100644 index 1a02a3fb53..0000000000 --- a/src/dbnode/persist/fs/wide/index_checksum_block_batch_reader.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2020 Uber Technologies, Inc. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE - -package wide - -import ( - "sync" -) - -type indexChecksumBlockReader struct { - mu sync.Mutex - closed bool - - currentBlock IndexChecksumBlockBatch - blocks chan IndexChecksumBlockBatch -} - -// NewIndexChecksumBlockBatchReader creates a new IndexChecksumBlockBatchReader. -func NewIndexChecksumBlockBatchReader( - blockInput chan IndexChecksumBlockBatch, -) IndexChecksumBlockBatchReader { - return &indexChecksumBlockReader{ - blocks: blockInput, - } -} - -func (b *indexChecksumBlockReader) Current() IndexChecksumBlockBatch { - return b.currentBlock -} - -func (b *indexChecksumBlockReader) Next() bool { - b.mu.Lock() - defer b.mu.Unlock() - - if b.closed { - return false - } - - if bl, ok := <-b.blocks; ok { - b.currentBlock = bl - return true - } - - b.closed = true - return false -} - -func (b *indexChecksumBlockReader) Close() { - b.mu.Lock() - defer b.mu.Unlock() - - if b.closed { - return - } - - // NB: drain block channel. - for range b.blocks { - } - - b.closed = true - return -} diff --git a/src/dbnode/persist/fs/wide/index_checksum_block_batch_reader_test.go b/src/dbnode/persist/fs/wide/index_checksum_block_batch_reader_test.go deleted file mode 100644 index 12e8ed6171..0000000000 --- a/src/dbnode/persist/fs/wide/index_checksum_block_batch_reader_test.go +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2020 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE - -package wide - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestIndexChecksumBlockBatchReader(t *testing.T) { - ch := make(chan IndexChecksumBlockBatch) - buf := NewIndexChecksumBlockBatchReader(ch) - bl := IndexChecksumBlockBatch{EndMarker: []byte("foo")} - bl2 := IndexChecksumBlockBatch{ - Checksums: []int64{1, 2, 3}, - EndMarker: []byte("bar"), - } - - go func() { - ch <- bl - ch <- bl2 - close(ch) - }() - - assert.True(t, buf.Next()) - assert.Equal(t, bl, buf.Current()) - assert.True(t, buf.Next()) - assert.Equal(t, bl2, buf.Current()) - assert.False(t, buf.Next()) -} diff --git a/src/dbnode/persist/fs/wide/options.go b/src/dbnode/persist/fs/wide/options.go deleted file mode 100644 index ae6a7d5ee3..0000000000 --- a/src/dbnode/persist/fs/wide/options.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2020 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package wide - -import ( - "errors" - "fmt" - - "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - "github.com/m3db/m3/src/x/instrument" - "github.com/m3db/m3/src/x/pool" -) - -const ( - defaultbatchSize = 1024 -) - -var ( - errDecodingOptionsUnset = errors.New("decoding options unset") - invalidBatchSizeTemplate = "batch size %d must be greater than 0" -) - -type options struct { - batchSize int - bytesPool pool.BytesPool - decodingOptions msgpack.DecodingOptions - instrumentOpts instrument.Options -} - -// NewOptions creates a new set of wide query options. 
-func NewOptions() Options { - return &options{ - batchSize: defaultbatchSize, - instrumentOpts: instrument.NewOptions(), - } -} - -func (o *options) Validate() error { - if o.decodingOptions == nil { - return errDecodingOptionsUnset - } - if o.batchSize < 1 { - return fmt.Errorf(invalidBatchSizeTemplate, o.batchSize) - } - return nil -} - -func (o *options) SetBatchSize(value int) Options { - opts := *o - opts.batchSize = value - return &opts -} - -func (o *options) BatchSize() int { - return o.batchSize -} - -func (o *options) SetDecodingOptions(value msgpack.DecodingOptions) Options { - opts := *o - opts.decodingOptions = value - return &opts -} - -func (o *options) DecodingOptions() msgpack.DecodingOptions { - return o.decodingOptions -} - -func (o *options) SetInstrumentOptions(value instrument.Options) Options { - opts := *o - opts.instrumentOpts = value - return &opts -} - -func (o *options) InstrumentOptions() instrument.Options { - return o.instrumentOpts -} diff --git a/src/dbnode/persist/fs/wide/options_test.go b/src/dbnode/persist/fs/wide/options_test.go deleted file mode 100644 index 65d5442fed..0000000000 --- a/src/dbnode/persist/fs/wide/options_test.go +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2020 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package wide - -import ( - "testing" - - "github.com/m3db/m3/src/dbnode/persist/fs/msgpack" - - "github.com/stretchr/testify/assert" -) - -func TestOptions(t *testing.T) { - opts := NewOptions() - assert.Error(t, opts.Validate()) - - decOpts := msgpack.NewDecodingOptions() - opts = opts.SetDecodingOptions(decOpts) - assert.Equal(t, decOpts, opts.DecodingOptions()) - assert.NoError(t, opts.Validate()) - - opts = opts.SetBatchSize(-1) - assert.Error(t, opts.Validate()) - - opts = opts.SetBatchSize(100) - assert.Equal(t, 100, opts.BatchSize()) -} diff --git a/src/dbnode/persist/fs/wide/types.go b/src/dbnode/persist/fs/wide/types.go deleted file mode 100644 index 4b11230c5b..0000000000 --- a/src/dbnode/persist/fs/wide/types.go +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020 Uber Technologies, Inc. 
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE
-
-package wide
-
-import (
-	"github.com/m3db/m3/src/dbnode/persist/fs/msgpack"
-	"github.com/m3db/m3/src/dbnode/x/xio"
-	"github.com/m3db/m3/src/x/checked"
-	"github.com/m3db/m3/src/x/instrument"
-)
-
-// Options represents the options for mismatch calculation.
-type Options interface {
-	// Validate will validate the options and return an error if not valid.
-	Validate() error
-
-	// SetBatchSize sets the batch size.
-	SetBatchSize(value int) Options
-
-	// BatchSize returns the batch size.
-	BatchSize() int
-
-	// SetDecodingOptions sets the decoding options.
-	SetDecodingOptions(value msgpack.DecodingOptions) Options
-
-	// DecodingOptions returns the decoding options.
-	DecodingOptions() msgpack.DecodingOptions
-
-	// SetInstrumentOptions sets the instrumentation options.
-	SetInstrumentOptions(value instrument.Options) Options
-
-	// InstrumentOptions returns the instrumentation options.
-	InstrumentOptions() instrument.Options
-}
-
-// ReadMismatch describes a series that does not match the expected wide index
-// checksum, with a descriptor of the mismatch. This can indicate either that
-// the expected checksum was not found, or that the checksums differ.
-type ReadMismatch struct {
-	// ReadMismatch extends IndexChecksum with additional mismatch fields.
-	xio.IndexChecksum
-	// Data is the data for the read mismatch. Set only on reader mismatches.
-	Data checked.Bytes
-}
-
-// IsReaderMismatch is true if this mismatch is on the reader side.
-func (r ReadMismatch) IsReaderMismatch() bool {
-	return r.IndexChecksum.ID != nil ||
-		r.IndexChecksum.EncodedTags != nil
-}
-
-// IndexChecksumBlockBatchReader is a reader across IndexChecksumBlockBatches.
-type IndexChecksumBlockBatchReader interface {
-	// Next moves to the next IndexChecksumBlockBatch element.
-	Next() bool
-	// Current yields the current IndexChecksumBlockBatch.
-	Current() IndexChecksumBlockBatch
-	// Close closes the reader, draining any incoming reads without using them.
-	Close()
-}
-
-// EntryChecksumMismatchChecker checks if a given entry should yield a mismatch.
-type EntryChecksumMismatchChecker interface {
-	// ComputeMismatchesForEntry determines if the given index entry is a mismatch.
-	ComputeMismatchesForEntry(entry xio.IndexChecksum) ([]ReadMismatch, error)
-	// Drain returns any unconsumed IndexChecksumBlockBatches as mismatches.
-	Drain() []ReadMismatch
-	// Lock acquires the mutex on this mismatch checker.
- Lock() - // Unlock unlocks the mutex on the mismatch checker. - Unlock() -} - -// StreamedMismatch yields a ReadMismatch value asynchronously, -// and any errors encountered during execution. -type StreamedMismatch interface { - // RetrieveMismatch retrieves the mismatch. - RetrieveMismatch() (ReadMismatch, error) -} - -type emptyStreamedMismatch struct{} - -func (emptyStreamedMismatch) RetrieveMismatch() (ReadMismatch, error) { - return ReadMismatch{}, nil -} - -// EmptyStreamedMismatch is an empty streamed mismatch batch. -var EmptyStreamedMismatch StreamedMismatch = emptyStreamedMismatch{} - -// IndexChecksumBlockBatch represents a batch of index checksums originating -// from a single series block. -type IndexChecksumBlockBatch struct { - // Checksums is the list of index checksums. - Checksums []int64 - // EndMarker is a batch marker, signifying the ID of the - // last element in the batch. - EndMarker []byte -} diff --git a/src/dbnode/persist/fs/wide/wide_mock.go b/src/dbnode/persist/fs/wide/wide_mock.go deleted file mode 100644 index 4650401438..0000000000 --- a/src/dbnode/persist/fs/wide/wide_mock.go +++ /dev/null @@ -1,147 +0,0 @@ -// Code generated by MockGen. DO NOT EDIT. -// Source: github.com/m3db/m3/src/dbnode/persist/fs/wide (interfaces: EntryChecksumMismatchChecker,StreamedMismatch) - -// Copyright (c) 2020 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -// Package wide is a generated GoMock package. 
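The generated mocks below follow the standard gomock controller/EXPECT pattern. As a rough usage sketch, not part of this patch (the test name is invented; imports assumed: testing and github.com/golang/mock/gomock):

func TestSketchMockedChecker(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	// Expect exactly one Drain call and stub an empty result.
	checker := NewMockEntryChecksumMismatchChecker(ctrl)
	checker.EXPECT().Drain().Return([]ReadMismatch{})

	if got := checker.Drain(); len(got) != 0 {
		t.Fatalf("expected no mismatches, got %d", len(got))
	}
}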
-package wide - -import ( - "reflect" - - "github.com/m3db/m3/src/dbnode/x/xio" - - "github.com/golang/mock/gomock" -) - -// MockEntryChecksumMismatchChecker is a mock of EntryChecksumMismatchChecker interface -type MockEntryChecksumMismatchChecker struct { - ctrl *gomock.Controller - recorder *MockEntryChecksumMismatchCheckerMockRecorder -} - -// MockEntryChecksumMismatchCheckerMockRecorder is the mock recorder for MockEntryChecksumMismatchChecker -type MockEntryChecksumMismatchCheckerMockRecorder struct { - mock *MockEntryChecksumMismatchChecker -} - -// NewMockEntryChecksumMismatchChecker creates a new mock instance -func NewMockEntryChecksumMismatchChecker(ctrl *gomock.Controller) *MockEntryChecksumMismatchChecker { - mock := &MockEntryChecksumMismatchChecker{ctrl: ctrl} - mock.recorder = &MockEntryChecksumMismatchCheckerMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use -func (m *MockEntryChecksumMismatchChecker) EXPECT() *MockEntryChecksumMismatchCheckerMockRecorder { - return m.recorder -} - -// ComputeMismatchesForEntry mocks base method -func (m *MockEntryChecksumMismatchChecker) ComputeMismatchesForEntry(arg0 xio.IndexChecksum) ([]ReadMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ComputeMismatchesForEntry", arg0) - ret0, _ := ret[0].([]ReadMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// ComputeMismatchesForEntry indicates an expected call of ComputeMismatchesForEntry -func (mr *MockEntryChecksumMismatchCheckerMockRecorder) ComputeMismatchesForEntry(arg0 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ComputeMismatchesForEntry", reflect.TypeOf((*MockEntryChecksumMismatchChecker)(nil).ComputeMismatchesForEntry), arg0) -} - -// Drain mocks base method -func (m *MockEntryChecksumMismatchChecker) Drain() []ReadMismatch { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Drain") - ret0, _ := ret[0].([]ReadMismatch) - return ret0 -} - -// Drain indicates an expected call of Drain -func (mr *MockEntryChecksumMismatchCheckerMockRecorder) Drain() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Drain", reflect.TypeOf((*MockEntryChecksumMismatchChecker)(nil).Drain)) -} - -// Lock mocks base method -func (m *MockEntryChecksumMismatchChecker) Lock() { - m.ctrl.T.Helper() - m.ctrl.Call(m, "Lock") -} - -// Lock indicates an expected call of Lock -func (mr *MockEntryChecksumMismatchCheckerMockRecorder) Lock() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Lock", reflect.TypeOf((*MockEntryChecksumMismatchChecker)(nil).Lock)) -} - -// Unlock mocks base method -func (m *MockEntryChecksumMismatchChecker) Unlock() { - m.ctrl.T.Helper() - m.ctrl.Call(m, "Unlock") -} - -// Unlock indicates an expected call of Unlock -func (mr *MockEntryChecksumMismatchCheckerMockRecorder) Unlock() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Unlock", reflect.TypeOf((*MockEntryChecksumMismatchChecker)(nil).Unlock)) -} - -// MockStreamedMismatch is a mock of StreamedMismatch interface -type MockStreamedMismatch struct { - ctrl *gomock.Controller - recorder *MockStreamedMismatchMockRecorder -} - -// MockStreamedMismatchMockRecorder is the mock recorder for MockStreamedMismatch -type MockStreamedMismatchMockRecorder struct { - mock *MockStreamedMismatch -} - -// NewMockStreamedMismatch creates a new mock instance 
-func NewMockStreamedMismatch(ctrl *gomock.Controller) *MockStreamedMismatch { - mock := &MockStreamedMismatch{ctrl: ctrl} - mock.recorder = &MockStreamedMismatchMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use -func (m *MockStreamedMismatch) EXPECT() *MockStreamedMismatchMockRecorder { - return m.recorder -} - -// RetrieveMismatch mocks base method -func (m *MockStreamedMismatch) RetrieveMismatch() (ReadMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "RetrieveMismatch") - ret0, _ := ret[0].(ReadMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// RetrieveMismatch indicates an expected call of RetrieveMismatch -func (mr *MockStreamedMismatchMockRecorder) RetrieveMismatch() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RetrieveMismatch", reflect.TypeOf((*MockStreamedMismatch)(nil).RetrieveMismatch)) -} diff --git a/src/dbnode/persist/schema/types.go b/src/dbnode/persist/schema/types.go index 1c2a3f7d3c..52ab6b233d 100644 --- a/src/dbnode/persist/schema/types.go +++ b/src/dbnode/persist/schema/types.go @@ -75,10 +75,10 @@ type IndexEntry struct { IndexChecksum int64 } -// IndexChecksum extends IndexEntry for use with queries, by providing +// WideEntry extends IndexEntry for use with queries, by providing // an additional metadata checksum field. -type IndexChecksum struct { - // IndexChecksum embeds IndexEntry. +type WideEntry struct { + // WideEntry embeds IndexEntry. IndexEntry // MetadataChecksum is the computed index metadata checksum. // NB: built from ID, DataChecksum, and tags. diff --git a/src/dbnode/storage/block/block_mock.go b/src/dbnode/storage/block/block_mock.go index 544b9c4998..a3462a1224 100644 --- a/src/dbnode/storage/block/block_mock.go +++ b/src/dbnode/storage/block/block_mock.go @@ -30,7 +30,6 @@ import ( "github.com/m3db/m3/src/dbnode/encoding" "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/dbnode/x/xio" @@ -833,42 +832,54 @@ func (mr *MockOnReadBlockMockRecorder) OnReadBlock(b interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnReadBlock", reflect.TypeOf((*MockOnReadBlock)(nil).OnReadBlock), b) } -// MockStreamedChecksum is a mock of StreamedChecksum interface -type MockStreamedChecksum struct { +// MockStreamedWideEntry is a mock of StreamedWideEntry interface +type MockStreamedWideEntry struct { ctrl *gomock.Controller - recorder *MockStreamedChecksumMockRecorder + recorder *MockStreamedWideEntryMockRecorder } -// MockStreamedChecksumMockRecorder is the mock recorder for MockStreamedChecksum -type MockStreamedChecksumMockRecorder struct { - mock *MockStreamedChecksum +// MockStreamedWideEntryMockRecorder is the mock recorder for MockStreamedWideEntry +type MockStreamedWideEntryMockRecorder struct { + mock *MockStreamedWideEntry } -// NewMockStreamedChecksum creates a new mock instance -func NewMockStreamedChecksum(ctrl *gomock.Controller) *MockStreamedChecksum { - mock := &MockStreamedChecksum{ctrl: ctrl} - mock.recorder = &MockStreamedChecksumMockRecorder{mock} +// NewMockStreamedWideEntry creates a new mock instance +func NewMockStreamedWideEntry(ctrl *gomock.Controller) *MockStreamedWideEntry { + mock := &MockStreamedWideEntry{ctrl: ctrl} + mock.recorder = &MockStreamedWideEntryMockRecorder{mock} return mock } // EXPECT returns an object that 
allows the caller to indicate expected use -func (m *MockStreamedChecksum) EXPECT() *MockStreamedChecksumMockRecorder { +func (m *MockStreamedWideEntry) EXPECT() *MockStreamedWideEntryMockRecorder { return m.recorder } -// RetrieveIndexChecksum mocks base method -func (m *MockStreamedChecksum) RetrieveIndexChecksum() (xio.IndexChecksum, error) { +// Finalize mocks base method +func (m *MockStreamedWideEntry) Finalize() { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "RetrieveIndexChecksum") - ret0, _ := ret[0].(xio.IndexChecksum) + m.ctrl.Call(m, "Finalize") +} + +// Finalize indicates an expected call of Finalize +func (mr *MockStreamedWideEntryMockRecorder) Finalize() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Finalize", reflect.TypeOf((*MockStreamedWideEntry)(nil).Finalize)) +} + +// RetrieveWideEntry mocks base method +func (m *MockStreamedWideEntry) RetrieveWideEntry() (xio.WideEntry, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RetrieveWideEntry") + ret0, _ := ret[0].(xio.WideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// RetrieveIndexChecksum indicates an expected call of RetrieveIndexChecksum -func (mr *MockStreamedChecksumMockRecorder) RetrieveIndexChecksum() *gomock.Call { +// RetrieveWideEntry indicates an expected call of RetrieveWideEntry +func (mr *MockStreamedWideEntryMockRecorder) RetrieveWideEntry() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RetrieveIndexChecksum", reflect.TypeOf((*MockStreamedChecksum)(nil).RetrieveIndexChecksum)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RetrieveWideEntry", reflect.TypeOf((*MockStreamedWideEntry)(nil).RetrieveWideEntry)) } // MockDatabaseBlockRetriever is a mock of DatabaseBlockRetriever interface @@ -923,34 +934,19 @@ func (mr *MockDatabaseBlockRetrieverMockRecorder) Stream(ctx, shard, id, blockSt return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stream", reflect.TypeOf((*MockDatabaseBlockRetriever)(nil).Stream), ctx, shard, id, blockStart, onRetrieve, nsCtx) } -// StreamIndexChecksum mocks base method -func (m *MockDatabaseBlockRetriever) StreamIndexChecksum(ctx context.Context, shard uint32, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (StreamedChecksum, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StreamIndexChecksum", ctx, shard, id, blockStart, nsCtx) - ret0, _ := ret[0].(StreamedChecksum) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// StreamIndexChecksum indicates an expected call of StreamIndexChecksum -func (mr *MockDatabaseBlockRetrieverMockRecorder) StreamIndexChecksum(ctx, shard, id, blockStart, nsCtx interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamIndexChecksum", reflect.TypeOf((*MockDatabaseBlockRetriever)(nil).StreamIndexChecksum), ctx, shard, id, blockStart, nsCtx) -} - -// StreamReadMismatches mocks base method -func (m *MockDatabaseBlockRetriever) StreamReadMismatches(ctx context.Context, shard uint32, mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (wide.StreamedMismatch, error) { +// StreamWideEntry mocks base method +func (m *MockDatabaseBlockRetriever) StreamWideEntry(ctx context.Context, shard uint32, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (StreamedWideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StreamReadMismatches", ctx, shard, mismatchChecker, id, blockStart, nsCtx) - ret0, 
_ := ret[0].(wide.StreamedMismatch) + ret := m.ctrl.Call(m, "StreamWideEntry", ctx, shard, id, blockStart, nsCtx) + ret0, _ := ret[0].(StreamedWideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// StreamReadMismatches indicates an expected call of StreamReadMismatches -func (mr *MockDatabaseBlockRetrieverMockRecorder) StreamReadMismatches(ctx, shard, mismatchChecker, id, blockStart, nsCtx interface{}) *gomock.Call { +// StreamWideEntry indicates an expected call of StreamWideEntry +func (mr *MockDatabaseBlockRetrieverMockRecorder) StreamWideEntry(ctx, shard, id, blockStart, nsCtx interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamReadMismatches", reflect.TypeOf((*MockDatabaseBlockRetriever)(nil).StreamReadMismatches), ctx, shard, mismatchChecker, id, blockStart, nsCtx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamWideEntry", reflect.TypeOf((*MockDatabaseBlockRetriever)(nil).StreamWideEntry), ctx, shard, id, blockStart, nsCtx) } // AssignShardSet mocks base method @@ -1003,34 +999,19 @@ func (mr *MockDatabaseShardBlockRetrieverMockRecorder) Stream(ctx, id, blockStar return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stream", reflect.TypeOf((*MockDatabaseShardBlockRetriever)(nil).Stream), ctx, id, blockStart, onRetrieve, nsCtx) } -// StreamIndexChecksum mocks base method -func (m *MockDatabaseShardBlockRetriever) StreamIndexChecksum(ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (StreamedChecksum, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StreamIndexChecksum", ctx, id, blockStart, nsCtx) - ret0, _ := ret[0].(StreamedChecksum) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// StreamIndexChecksum indicates an expected call of StreamIndexChecksum -func (mr *MockDatabaseShardBlockRetrieverMockRecorder) StreamIndexChecksum(ctx, id, blockStart, nsCtx interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamIndexChecksum", reflect.TypeOf((*MockDatabaseShardBlockRetriever)(nil).StreamIndexChecksum), ctx, id, blockStart, nsCtx) -} - -// StreamReadMismatches mocks base method -func (m *MockDatabaseShardBlockRetriever) StreamReadMismatches(ctx context.Context, mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (wide.StreamedMismatch, error) { +// StreamWideEntry mocks base method +func (m *MockDatabaseShardBlockRetriever) StreamWideEntry(ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (StreamedWideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StreamReadMismatches", ctx, mismatchChecker, id, blockStart, nsCtx) - ret0, _ := ret[0].(wide.StreamedMismatch) + ret := m.ctrl.Call(m, "StreamWideEntry", ctx, id, blockStart, nsCtx) + ret0, _ := ret[0].(StreamedWideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// StreamReadMismatches indicates an expected call of StreamReadMismatches -func (mr *MockDatabaseShardBlockRetrieverMockRecorder) StreamReadMismatches(ctx, mismatchChecker, id, blockStart, nsCtx interface{}) *gomock.Call { +// StreamWideEntry indicates an expected call of StreamWideEntry +func (mr *MockDatabaseShardBlockRetrieverMockRecorder) StreamWideEntry(ctx, id, blockStart, nsCtx interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamReadMismatches", reflect.TypeOf((*MockDatabaseShardBlockRetriever)(nil).StreamReadMismatches), 
ctx, mismatchChecker, id, blockStart, nsCtx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamWideEntry", reflect.TypeOf((*MockDatabaseShardBlockRetriever)(nil).StreamWideEntry), ctx, id, blockStart, nsCtx) } // MockDatabaseBlockRetrieverManager is a mock of DatabaseBlockRetrieverManager interface diff --git a/src/dbnode/storage/block/retriever_manager.go b/src/dbnode/storage/block/retriever_manager.go index aae5943fc5..979735bf78 100644 --- a/src/dbnode/storage/block/retriever_manager.go +++ b/src/dbnode/storage/block/retriever_manager.go @@ -25,7 +25,6 @@ import ( "time" "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/x/xio" "github.com/m3db/m3/src/x/context" @@ -113,27 +112,16 @@ func (r *shardBlockRetriever) Stream( blockStart, onRetrieve, nsCtx) } -func (r *shardBlockRetriever) StreamIndexChecksum( +func (r *shardBlockRetriever) StreamWideEntry( ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context, -) (StreamedChecksum, error) { - return r.DatabaseBlockRetriever.StreamIndexChecksum(ctx, r.shard, id, +) (StreamedWideEntry, error) { + return r.DatabaseBlockRetriever.StreamWideEntry(ctx, r.shard, id, blockStart, nsCtx) } -func (r *shardBlockRetriever) StreamReadMismatches( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - id ident.ID, - blockStart time.Time, - nsCtx namespace.Context, -) (wide.StreamedMismatch, error) { - return r.DatabaseBlockRetriever.StreamReadMismatches(ctx, r.shard, - mismatchChecker, id, blockStart, nsCtx) -} - type shardBlockRetrieverManager struct { sync.RWMutex retriever DatabaseBlockRetriever diff --git a/src/dbnode/storage/block/types.go b/src/dbnode/storage/block/types.go index 6bf349633f..34a2161ab8 100644 --- a/src/dbnode/storage/block/types.go +++ b/src/dbnode/storage/block/types.go @@ -25,7 +25,6 @@ import ( "github.com/m3db/m3/src/dbnode/encoding" "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/topology" "github.com/m3db/m3/src/dbnode/ts" @@ -34,6 +33,7 @@ import ( "github.com/m3db/m3/src/x/context" "github.com/m3db/m3/src/x/ident" "github.com/m3db/m3/src/x/pool" + "github.com/m3db/m3/src/x/resource" xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" ) @@ -268,21 +268,26 @@ type RetrievableBlockMetadata struct { Checksum uint32 } -// StreamedChecksum yields a xio.IndexChecksum value asynchronously, +// StreamedWideEntry yields a xio.WideEntry value asynchronously, // and any errors encountered during execution. -type StreamedChecksum interface { - // RetrieveIndexChecksum retrieves the index checksum. - RetrieveIndexChecksum() (xio.IndexChecksum, error) +type StreamedWideEntry interface { + resource.Finalizer + + // RetrieveWideEntry retrieves the collected wide entry. + RetrieveWideEntry() (xio.WideEntry, error) } -type emptyStreamedChecksum struct{} +type emptyWideEntry struct{} + +func (emptyWideEntry) RetrieveWideEntry() (xio.WideEntry, error) { + return xio.WideEntry{}, nil +} -func (emptyStreamedChecksum) RetrieveIndexChecksum() (xio.IndexChecksum, error) { - return xio.IndexChecksum{}, nil +func (emptyWideEntry) Finalize() { } -// EmptyStreamedChecksum is an empty streamed checksum. -var EmptyStreamedChecksum StreamedChecksum = emptyStreamedChecksum{} +// EmptyStreamedWideEntry is an empty streamed wide entry. 
+var EmptyStreamedWideEntry StreamedWideEntry = emptyWideEntry{} // DatabaseBlockRetriever is a block retriever. type DatabaseBlockRetriever interface { @@ -300,27 +305,17 @@ type DatabaseBlockRetriever interface { nsCtx namespace.Context, ) (xio.BlockReader, error) - // StreamIndexChecksum will stream the index checksum for a given id within - // a block, yielding an index checksum if it is available in the shard. - StreamIndexChecksum( - ctx context.Context, - shard uint32, - id ident.ID, - blockStart time.Time, - nsCtx namespace.Context, - ) (StreamedChecksum, error) - - // StreamReadMismatches will stream reader mismatches for a given id within - // a block, yielding any streamed checksums within the shard. - StreamReadMismatches( + // StreamWideEntry will stream the wide entry for a given ID within + // a block, yielding a wide entry if it is available in the shard. + StreamWideEntry( ctx context.Context, shard uint32, - mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time, nsCtx namespace.Context, - ) (wide.StreamedMismatch, error) + ) (StreamedWideEntry, error) + // AssignShardSet assigns the given shard set to this retriever. AssignShardSet(shardSet sharding.ShardSet) } @@ -335,24 +330,14 @@ type DatabaseShardBlockRetriever interface { nsCtx namespace.Context, ) (xio.BlockReader, error) - // StreamIndexChecksum will stream the index checksum for a given id within - // a block, yielding an index checksum if available. - StreamIndexChecksum( - ctx context.Context, - id ident.ID, - blockStart time.Time, - nsCtx namespace.Context, - ) (StreamedChecksum, error) - - // StreamReadMismatches will stream read index mismatches for a given id - // within a block, yielding any read mismatches. - StreamReadMismatches( + // StreamWideEntry will stream the wide entry for a given ID within + // a block, yielding a wide entry if available. + StreamWideEntry( ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time, nsCtx namespace.Context, - ) (wide.StreamedMismatch, error) + ) (StreamedWideEntry, error) } // DatabaseBlockRetrieverManager creates and holds block retrievers diff --git a/src/dbnode/storage/database.go b/src/dbnode/storage/database.go index 77e7fa4236..5f6ae35ea2 100644 --- a/src/dbnode/storage/database.go +++ b/src/dbnode/storage/database.go @@ -30,7 +30,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" dberrors "github.com/m3db/m3/src/dbnode/storage/errors" @@ -994,7 +993,7 @@ func (d *db) WideQuery( queryStart time.Time, shards []uint32, iterOpts index.IterationOptions, -) ([]xio.IndexChecksum, error) { // FIXME: change when exact type known. +) ([]xio.WideEntry, error) { // nolint FIXME: change when exact type known. 
n, err := d.namespaceFor(namespace) if err != nil { d.metrics.unknownNamespaceRead.Inc(1) @@ -1005,7 +1004,7 @@ func (d *db) WideQuery( batchSize = d.opts.WideBatchSize() blockSize = n.Options().IndexOptions().BlockSize() - collectedChecksums = make([]xio.IndexChecksum, 0, 10) + collectedChecksums = make([]xio.WideEntry, 0, 10) ) opts, err := index.NewWideQueryOptions(queryStart, batchSize, blockSize, shards, iterOpts) @@ -1027,30 +1026,24 @@ func (d *db) WideQuery( defer sp.Finish() - streamedChecksums := make([]block.StreamedChecksum, 0, batchSize) + streamedWideEntries := make([]block.StreamedWideEntry, 0, batchSize) indexChecksumProcessor := func(batch *ident.IDBatch) error { - streamedChecksums = streamedChecksums[:0] + streamedWideEntries = streamedWideEntries[:0] for _, id := range batch.IDs { - streamedChecksum, err := d.fetchIndexChecksum(ctx, n, id, start) + streamedWideEntry, err := d.fetchWideEntries(ctx, n, id, start) if err != nil { return err } - streamedChecksums = append(streamedChecksums, streamedChecksum) + streamedWideEntries = append(streamedWideEntries, streamedWideEntry) } - for i, streamedChecksum := range streamedChecksums { - checksum, err := streamedChecksum.RetrieveIndexChecksum() + for _, streamedWideEntry := range streamedWideEntries { + checksum, err := streamedWideEntry.RetrieveWideEntry() if err != nil { return err } - // TODO: use index checksum value to call downstreams. - useID := i == len(batch.IDs)-1 - if !useID { - checksum.ID.Finalize() - } - collectedChecksums = append(collectedChecksums, checksum) } @@ -1065,13 +1058,13 @@ func (d *db) WideQuery( return collectedChecksums, nil } -func (d *db) fetchIndexChecksum( +func (d *db) fetchWideEntries( ctx context.Context, ns databaseNamespace, id ident.ID, start time.Time, -) (block.StreamedChecksum, error) { - ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.DBIndexChecksum) +) (block.StreamedWideEntry, error) { + ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.DBWideEntry) if sampled { sp.LogFields( opentracinglog.String("namespace", ns.ID().String()), @@ -1081,100 +1074,8 @@ func (d *db) fetchIndexChecksum( } defer sp.Finish() - return ns.FetchIndexChecksum(ctx, id, start) -} - -func (d *db) ReadMismatches( - ctx context.Context, - namespace ident.ID, - query index.Query, - mismatchChecker wide.EntryChecksumMismatchChecker, - queryStart time.Time, - shards []uint32, - iterOpts index.IterationOptions, -) ([]wide.ReadMismatch, error) { // TODO: update this type when reader hooked up - n, err := d.namespaceFor(namespace) - if err != nil { - d.metrics.unknownNamespaceRead.Inc(1) - return nil, err - } - - var ( - batchSize = d.opts.WideBatchSize() - blockSize = n.Options().IndexOptions().BlockSize() - collectedMismatches = make([]wide.ReadMismatch, 0, 10) - ) - - opts, err := index.NewWideQueryOptions(queryStart, batchSize, blockSize, shards, iterOpts) - if err != nil { - return nil, err - } - - start, end := opts.StartInclusive, opts.EndExclusive - ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.DBReadMismatches) - if sampled { - sp.LogFields( - opentracinglog.String("readMismatches", query.String()), - opentracinglog.String("namespace", namespace.String()), - opentracinglog.Int("batchSize", batchSize), - xopentracing.Time("start", start), - xopentracing.Time("end", end), - ) - } - - defer sp.Finish() - - streamedMismatches := make([]wide.StreamedMismatch, 0, batchSize) - streamMismatchProcessor := func(batch *ident.IDBatch) error { - streamedMismatches = streamedMismatches[:0] - 
for _, id := range batch.IDs { - streamedMismatch, err := d.fetchReadMismatch(ctx, n, mismatchChecker, id, start) - if err != nil { - return err - } - - streamedMismatches = append(streamedMismatches, streamedMismatch) - } - - for _, streamedMismatch := range streamedMismatches { - mismatch, err := streamedMismatch.RetrieveMismatch() - if err != nil { - return err - } - - collectedMismatches = append(collectedMismatches, mismatch) - } - - return nil - } - - err = d.batchProcessWideQuery(ctx, n, query, streamMismatchProcessor, opts) - if err != nil { - return nil, err - } - - return collectedMismatches, nil -} - -func (d *db) fetchReadMismatch( - ctx context.Context, - ns databaseNamespace, - mismatchChecker wide.EntryChecksumMismatchChecker, - id ident.ID, - start time.Time, -) (wide.StreamedMismatch, error) { - ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.DBFetchMismatch) - if sampled { - sp.LogFields( - opentracinglog.String("namespace", ns.ID().String()), - opentracinglog.String("id", id.String()), - xopentracing.Time("start", start), - ) - } - - defer sp.Finish() - return ns.FetchReadMismatch(ctx, mismatchChecker, id, start) + return ns.FetchWideEntry(ctx, id, start) } func (d *db) FetchBlocks( diff --git a/src/dbnode/storage/database_test.go b/src/dbnode/storage/database_test.go index b4406a7e72..88d3af5540 100644 --- a/src/dbnode/storage/database_test.go +++ b/src/dbnode/storage/database_test.go @@ -33,7 +33,6 @@ import ( "github.com/m3db/m3/src/dbnode/client" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" @@ -311,7 +310,7 @@ func TestDatabaseWideQueryNamespaceNonExistent(t *testing.T) { require.True(t, dberrors.IsUnknownNamespaceError(err)) } -func TestDatabaseIndexChecksum(t *testing.T) { +func TestDatabaseWideEntry(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() @@ -328,33 +327,34 @@ func TestDatabaseIndexChecksum(t *testing.T) { end := time.Now() start := end.Add(-time.Hour) - indexChecksumWithID := block.NewMockStreamedChecksum(ctrl) - indexChecksumWithID.EXPECT().RetrieveIndexChecksum(). + indexChecksumWithID := block.NewMockStreamedWideEntry(ctrl) + indexChecksumWithID.EXPECT().RetrieveWideEntry(). Return( - xio.IndexChecksum{ + xio.WideEntry{ ID: ident.StringID("foo"), MetadataChecksum: 5, }, nil) mockNamespace := NewMockdatabaseNamespace(ctrl) - mockNamespace.EXPECT().FetchIndexChecksum(ctx, seriesID, start). + mockNamespace.EXPECT().FetchWideEntry(ctx, seriesID, start). Return(indexChecksumWithID, nil) - indexChecksumWithoutID := block.NewMockStreamedChecksum(ctrl) - indexChecksumWithoutID.EXPECT().RetrieveIndexChecksum(). - Return(xio.IndexChecksum{MetadataChecksum: 7}, nil) - mockNamespace.EXPECT().FetchIndexChecksum(ctx, seriesID, start). + indexChecksumWithoutID := block.NewMockStreamedWideEntry(ctrl) + indexChecksumWithoutID.EXPECT().RetrieveWideEntry(). + Return(xio.WideEntry{MetadataChecksum: 7}, nil) + mockNamespace.EXPECT().FetchWideEntry(ctx, seriesID, start). 
Return(indexChecksumWithoutID, nil) d.namespaces.Set(nsID, mockNamespace) - res, err := d.fetchIndexChecksum(ctx, mockNamespace, seriesID, start) + res, err := d.fetchWideEntries(ctx, mockNamespace, seriesID, start) require.NoError(t, err) - checksum, err := res.RetrieveIndexChecksum() + checksum, err := res.RetrieveWideEntry() require.NoError(t, err) assert.Equal(t, "foo", checksum.ID.String()) assert.Equal(t, 5, int(checksum.MetadataChecksum)) - res, err = d.fetchIndexChecksum(ctx, mockNamespace, seriesID, start) - checksum, err = res.RetrieveIndexChecksum() + res, err = d.fetchWideEntries(ctx, mockNamespace, seriesID, start) + require.NoError(t, err) + checksum, err = res.RetrieveWideEntry() require.NoError(t, err) require.NoError(t, err) assert.Nil(t, checksum.ID) @@ -953,9 +953,9 @@ func TestWideQuery(t *testing.T) { ctx context.Context, t *testing.T, ctrl *gomock.Controller, ns *MockdatabaseNamespace, d *db, q index.Query, now time.Time, shards []uint32, iterOpts index.IterationOptions) { - ns.EXPECT().FetchIndexChecksum(gomock.Any(), + ns.EXPECT().FetchWideEntry(gomock.Any(), ident.StringID("foo"), gomock.Any()). - Return(block.EmptyStreamedChecksum, nil) + Return(block.EmptyStreamedWideEntry, nil) _, err := d.WideQuery(ctx, ident.StringID("testns"), q, now, shards, iterOpts) require.NoError(t, err) @@ -965,7 +965,7 @@ func TestWideQuery(t *testing.T) { } exSpans := []string{ - tracepoint.DBIndexChecksum, + tracepoint.DBWideEntry, tracepoint.DBWideQuery, tracepoint.DBWideQuery, "root", @@ -974,35 +974,8 @@ func TestWideQuery(t *testing.T) { testWideFunction(t, readMismatchTest, exSpans) } -func TestReadMismatches(t *testing.T) { - readMismatchTest := func( - ctx context.Context, t *testing.T, ctrl *gomock.Controller, - ns *MockdatabaseNamespace, d *db, q index.Query, - now time.Time, shards []uint32, iterOpts index.IterationOptions) { - checker := wide.NewMockEntryChecksumMismatchChecker(ctrl) - ns.EXPECT().FetchReadMismatch(gomock.Any(), checker, - ident.StringID("foo"), gomock.Any()). - Return(wide.EmptyStreamedMismatch, nil) - - _, err := d.ReadMismatches(ctx, ident.StringID("testns"), q, checker, now, shards, iterOpts) - require.NoError(t, err) - - _, err = d.ReadMismatches(ctx, ident.StringID("testns"), q, checker, now, nil, iterOpts) - require.Error(t, err) - } - - exSpans := []string{ - tracepoint.DBFetchMismatch, - tracepoint.DBReadMismatches, - tracepoint.DBReadMismatches, - "root", - } - - testWideFunction(t, readMismatchTest, exSpans) -} - func testWideFunction(t *testing.T, testFn wideQueryTestFn, exSpans []string) { - ctrl := xtest.NewController(t) + ctrl := gomock.NewController(t) defer ctrl.Finish() d, mapCh, _ := defaultTestDatabase(t, ctrl, BootstrapNotStarted) @@ -1029,9 +1002,10 @@ func testWideFunction(t *testing.T, testFn wideQueryTestFn, exSpans []string) { } now = time.Now() + start = now.Truncate(2 * time.Hour) iterOpts = index.IterationOptions{} wideOpts = index.WideQueryOptions{ - StartInclusive: now.Truncate(2 * time.Hour), + StartInclusive: start, EndExclusive: now.Truncate(2 * time.Hour).Add(2 * time.Hour), IterationOptions: iterOpts, BatchSize: 1024, @@ -1064,7 +1038,7 @@ func testWideFunction(t *testing.T, testFn wideQueryTestFn, exSpans []string) { ns.EXPECT().WideQueryIDs(gomock.Any(), q, gomock.Any(), gomock.Any()). 
Return(fmt.Errorf("random err"))
- testFn(ctx, t, ctrl, ns, d, q, now, shards, iterOpts)
+ testFn(ctx, t, ctrl, ns, d, q, start, shards, iterOpts)
 ns.EXPECT().Close().Return(nil)
 // Ensure commitlog is set before closing because this will call commitlog.Close()
 d.commitLog = commitLog
diff --git a/src/dbnode/storage/index/query_options.go b/src/dbnode/storage/index/query_options.go
index 1355e70232..ff7ffb5b4a 100644
--- a/src/dbnode/storage/index/query_options.go
+++ b/src/dbnode/storage/index/query_options.go
@@ -54,7 +54,7 @@ var (
 // NewWideQueryOptions creates a new wide query options, snapped to block start.
 func NewWideQueryOptions(
- queryStart time.Time,
+ blockStart time.Time,
 batchSize int,
 blockSize time.Duration,
 shards []uint32,
@@ -68,8 +68,11 @@
 return WideQueryOptions{}, fmt.Errorf(errInvalidBlockSize, blockSize)
 }
 
- start := queryStart.Truncate(blockSize)
- end := start.Add(blockSize)
+ if !blockStart.Equal(blockStart.Truncate(blockSize)) {
+ return WideQueryOptions{},
+ fmt.Errorf("block start not divisible by block size: start=%s, size=%s",
+ blockStart.String(), blockSize.String())
+ }
 
 // NB: shards queried must be sorted.
 sort.Slice(shards, func(i, j int) bool {
@@ -77,8 +80,8 @@
 })
 
 return WideQueryOptions{
- StartInclusive: start,
- EndExclusive: end,
+ StartInclusive: blockStart,
+ EndExclusive: blockStart.Add(blockSize),
 BatchSize: batchSize,
 IterationOptions: iterOpts,
 ShardsQueried: shards,
diff --git a/src/dbnode/storage/index/query_options_test.go b/src/dbnode/storage/index/query_options_test.go
index 47e2585da3..01add7e973 100644
--- a/src/dbnode/storage/index/query_options_test.go
+++ b/src/dbnode/storage/index/query_options_test.go
@@ -52,7 +52,7 @@ func TestQueryOptions(t *testing.T) {
 
 func TestInvalidWideQueryOptions(t *testing.T) {
 var (
- now = time.Now()
+ now = time.Now().Truncate(time.Hour).Add(1) // 1ns past the hour: deliberately unaligned
 iterOpts = IterationOptions{}
 
 batchSize int
@@ -68,14 +68,18 @@
 
 blockSize = time.Minute
 _, err = NewWideQueryOptions(now, batchSize, blockSize, nil, iterOpts)
+ require.Error(t, err)
+
+ now = now.Truncate(blockSize)
+ _, err = NewWideQueryOptions(now, batchSize, blockSize, nil, iterOpts)
 require.NoError(t, err)
 }
 
 func TestWideQueryOptions(t *testing.T) {
 var (
- now = time.Now()
 batchSize = 100
 blockSize = time.Hour * 2
+ now = time.Now().Truncate(blockSize)
 iterOpts = IterationOptions{}
 shards = []uint32{100, 23, 1}
 )
diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go
index 78b2cbc1ea..ebceb8f8e4 100644
--- a/src/dbnode/storage/index/types.go
+++ b/src/dbnode/storage/index/types.go
@@ -101,7 +101,7 @@ type WideQueryOptions struct {
 StartInclusive time.Time
 // EndExclusive is the exclusive end for the query.
 EndExclusive time.Time
- // BatchSize controls IndexChecksumQuery batch size.
+ // BatchSize controls wide query batch size.
 BatchSize int
 // ShardsQueried are the shards to query. These must be in ascending order.
 // If empty, all shards are queried.
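NB: with the stricter options constructor above, wide-query callers must now snap the query start to a block boundary themselves; NewWideQueryOptions rejects an unaligned start instead of silently truncating it. A minimal sketch of the expected call pattern, assuming a 2-hour index block size (the helper name and the 1024 batch size are illustrative only, not part of this patch):

	import (
		"time"

		"github.com/m3db/m3/src/dbnode/storage/index"
	)

	// newAlignedWideQueryOptions builds wide query options whose start is
	// guaranteed to sit exactly on an index block boundary.
	func newAlignedWideQueryOptions() (index.WideQueryOptions, error) {
		blockSize := 2 * time.Hour
		// Truncate snaps the start onto the block boundary; an unaligned
		// start now fails with "block start not divisible by block size"
		// rather than being truncated inside the constructor.
		blockStart := time.Now().Truncate(blockSize)
		return index.NewWideQueryOptions(
			blockStart, 1024, blockSize, nil, index.IterationOptions{})
	}

The updated TestInvalidWideQueryOptions above exercises exactly this contract: the same call fails while the start is unaligned and succeeds once it is truncated to the block size.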
diff --git a/src/dbnode/storage/index/wide_query_results_test.go b/src/dbnode/storage/index/wide_query_results_test.go index b0ce72dd04..b0279e8bd5 100644 --- a/src/dbnode/storage/index/wide_query_results_test.go +++ b/src/dbnode/storage/index/wide_query_results_test.go @@ -120,8 +120,13 @@ func drainAndCheckBatches( } func TestWideSeriesResults(t *testing.T) { + var ( + max = 31 + blockSize = time.Hour * 2 + now = time.Now().Truncate(blockSize) + ) + // Test many different permutations of element count and batch sizes. - max := 31 for documentCount := 0; documentCount < max; documentCount++ { for docBatchSize := 1; docBatchSize < max; docBatchSize++ { for batchSize := 1; batchSize < max; batchSize++ { @@ -136,7 +141,7 @@ func TestWideSeriesResults(t *testing.T) { drainAndCheckBatches(t, expected, batchCh, doneCh) wideQueryOptions, err := NewWideQueryOptions( - time.Now(), batchSize, time.Hour*2, nil, IterationOptions{}) + now, batchSize, blockSize, nil, IterationOptions{}) require.NoError(t, err) wideRes := NewWideQueryResults(testNs, testIDPool, nil, batchCh, wideQueryOptions) @@ -180,6 +185,9 @@ func TestWideSeriesResultsWithShardFilter(t *testing.T) { batchCh = make(chan *ident.IDBatch) doneCh = make(chan struct{}) + + blockSize = time.Hour * 2 + now = time.Now().Truncate(blockSize) ) docs := buildDocs(documentCount, docBatchSize) @@ -189,7 +197,7 @@ func TestWideSeriesResultsWithShardFilter(t *testing.T) { drainAndCheckBatches(t, expected, batchCh, doneCh) wideQueryOptions, err := NewWideQueryOptions( - time.Now(), batchSize, time.Hour*2, shards, IterationOptions{}) + now, batchSize, blockSize, shards, IterationOptions{}) require.NoError(t, err) filter := func(id ident.ID) (uint32, bool) { i, err := strconv.Atoi(strings.TrimPrefix(id.String(), "foo")) diff --git a/src/dbnode/storage/index_queue_forward_write_test.go b/src/dbnode/storage/index_queue_forward_write_test.go index 15804cf2b4..8b72db6771 100644 --- a/src/dbnode/storage/index_queue_forward_write_test.go +++ b/src/dbnode/storage/index_queue_forward_write_test.go @@ -207,9 +207,9 @@ func TestNamespaceForwardIndexAggregateQuery(t *testing.T) { } func TestNamespaceForwardIndexWideQuery(t *testing.T) { - ctrl := gomock.NewController(t) + ctrl := xtest.NewController(t) defer ctrl.Finish() - defer leaktest.CheckTimeout(t, 2*time.Second)() + defer leaktest.CheckTimeout(t, 5*time.Second)() ctx := context.NewContext() defer ctx.Close() @@ -223,7 +223,10 @@ func TestNamespaceForwardIndexWideQuery(t *testing.T) { // NB: query both the current and the next index block to ensure that the // write was correctly indexed to both. 
nextBlockTime := now.Add(blockSize) - queryTimes := []time.Time{now, nextBlockTime} + queryTimes := []time.Time{ + now.Truncate(blockSize), + nextBlockTime.Truncate(blockSize), + } for _, ts := range queryTimes { collector := make(chan *ident.IDBatch) doneCh := make(chan struct{}) diff --git a/src/dbnode/storage/index_queue_test.go b/src/dbnode/storage/index_queue_test.go index 1b1c3e2e4c..9960da5beb 100644 --- a/src/dbnode/storage/index_queue_test.go +++ b/src/dbnode/storage/index_queue_test.go @@ -395,7 +395,7 @@ func TestNamespaceIndexInsertAggregateQuery(t *testing.T) { func TestNamespaceIndexInsertWideQuery(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() - defer leaktest.CheckTimeout(t, 2*time.Second)() + defer leaktest.CheckTimeout(t, 5*time.Second)() ctx := context.NewContext() defer ctx.Close() @@ -408,8 +408,9 @@ func TestNamespaceIndexInsertWideQuery(t *testing.T) { assert.NoError(t, err) doneCh := make(chan struct{}) collector := make(chan *ident.IDBatch) - queryOpts, err := index.NewWideQueryOptions(time.Now(), 5, - time.Hour*2, nil, index.IterationOptions{}) + blockSize := 2 * time.Hour + queryOpts, err := index.NewWideQueryOptions(time.Now().Truncate(blockSize), + 5, blockSize, nil, index.IterationOptions{}) require.NoError(t, err) expectedBatchIDs := [][]string{{"foo"}} @@ -441,7 +442,7 @@ func TestNamespaceIndexInsertWideQuery(t *testing.T) { func TestNamespaceIndexInsertWideQueryFilteredByShard(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() - defer leaktest.CheckTimeout(t, 2*time.Second)() + defer leaktest.CheckTimeout(t, 5*time.Second)() ctx := context.NewContext() defer ctx.Close() @@ -456,8 +457,9 @@ func TestNamespaceIndexInsertWideQueryFilteredByShard(t *testing.T) { collector := make(chan *ident.IDBatch) shard := testShardSet.Lookup(ident.StringID("foo")) offShard := shard + 1 - queryOpts, err := index.NewWideQueryOptions(time.Now(), 5, time.Hour*2, - []uint32{offShard}, index.IterationOptions{}) + blockSize := 2 * time.Hour + queryOpts, err := index.NewWideQueryOptions(time.Now().Truncate(blockSize), + 5, blockSize, []uint32{offShard}, index.IterationOptions{}) require.NoError(t, err) go func() { diff --git a/src/dbnode/storage/namespace.go b/src/dbnode/storage/namespace.go index 5367b17574..bc5fc27311 100644 --- a/src/dbnode/storage/namespace.go +++ b/src/dbnode/storage/namespace.go @@ -31,7 +31,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/storage/bootstrap" @@ -918,36 +917,22 @@ func (n *dbNamespace) ReadEncoded( return res, err } -func (n *dbNamespace) FetchIndexChecksum( +func (n *dbNamespace) FetchWideEntry( ctx context.Context, id ident.ID, blockStart time.Time, -) (block.StreamedChecksum, error) { +) (block.StreamedWideEntry, error) { callStart := n.nowFn() shard, nsCtx, err := n.readableShardFor(id) if err != nil { n.metrics.read.ReportError(n.nowFn().Sub(callStart)) - return block.EmptyStreamedChecksum, err - } - res, err := shard.FetchIndexChecksum(ctx, id, blockStart, nsCtx) - n.metrics.read.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) - return res, err -} -func (n *dbNamespace) FetchReadMismatch( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - id ident.ID, - blockStart time.Time, -) (wide.StreamedMismatch, error) { - 
callStart := n.nowFn() - shard, nsCtx, err := n.readableShardFor(id) - if err != nil { - n.metrics.read.ReportError(n.nowFn().Sub(callStart)) - return wide.EmptyStreamedMismatch, err + return block.EmptyStreamedWideEntry, err } - res, err := shard.FetchReadMismatch(ctx, mismatchChecker, id, blockStart, nsCtx) + + res, err := shard.FetchWideEntry(ctx, id, blockStart, nsCtx) n.metrics.read.ReportSuccessOrError(err, n.nowFn().Sub(callStart)) + return res, err } diff --git a/src/dbnode/storage/options.go b/src/dbnode/storage/options.go index 999de9a99e..f5c79e9890 100644 --- a/src/dbnode/storage/options.go +++ b/src/dbnode/storage/options.go @@ -167,6 +167,7 @@ type options struct { schemaReg namespace.SchemaRegistry blockLeaseManager block.LeaseManager onColdFlush OnColdFlush + iterationOptions index.IterationOptions memoryTracker MemoryTracker mmapReporter mmap.Reporter doNotIndexWithFieldsMap map[string]string @@ -788,6 +789,16 @@ func (o *options) OnColdFlush() OnColdFlush { return o.onColdFlush } +func (o *options) SetIterationOptions(value index.IterationOptions) Options { + opts := *o + opts.iterationOptions = value + return &opts +} + +func (o *options) IterationOptions() index.IterationOptions { + return o.iterationOptions +} + func (o *options) SetMemoryTracker(memTracker MemoryTracker) Options { opts := *o opts.memoryTracker = memTracker diff --git a/src/dbnode/storage/series/reader.go b/src/dbnode/storage/series/reader.go index 32bd6e3d17..0fc9ccb44a 100644 --- a/src/dbnode/storage/series/reader.go +++ b/src/dbnode/storage/series/reader.go @@ -26,7 +26,6 @@ import ( "time" "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/x/xio" @@ -72,7 +71,7 @@ func NewReaderUsingRetriever( } // ReadEncoded reads encoded blocks using just a block retriever. -func (r Reader) ReadEncoded( +func (r *Reader) ReadEncoded( ctx context.Context, start, end time.Time, nsCtx namespace.Context, @@ -80,24 +79,13 @@ func (r Reader) ReadEncoded( return r.readersWithBlocksMapAndBuffer(ctx, start, end, nil, nil, nsCtx) } -func (r Reader) readersWithBlocksMapAndBuffer( +func (r *Reader) readersWithBlocksMapAndBuffer( ctx context.Context, start, end time.Time, seriesBlocks block.DatabaseSeriesBlocks, seriesBuffer databaseBuffer, nsCtx namespace.Context, ) ([][]xio.BlockReader, error) { - // Two-dimensional slice such that the first dimension is unique by blockstart - // and the second dimension is blocks of data for that blockstart (not necessarily - // in chronological order). - // - // ex. 
(querying 2P.M -> 6P.M with a 2-hour blocksize): - // [][]xio.BlockReader{ - // {block0, block1, block2}, // <- 2P.M - // {block0, block1}, // <-4P.M - // } - var results [][]xio.BlockReader - if end.Before(start) { return nil, xerrors.NewInvalidParamsError(errSeriesReadInvalidRange) } @@ -105,7 +93,6 @@ func (r Reader) readersWithBlocksMapAndBuffer( var ( nowFn = r.opts.ClockOptions().NowFn() now = nowFn() - cachePolicy = r.opts.CachePolicy() ropts = r.opts.RetentionOptions() size = ropts.BlockSize() alignedStart = start.Truncate(size) @@ -127,8 +114,41 @@ func (r Reader) readersWithBlocksMapAndBuffer( alignedEnd = latest } - first, last := alignedStart, alignedEnd - for blockAt := first; !blockAt.After(last); blockAt = blockAt.Add(size) { + return r.readersWithBlocksMapAndBufferAligned(ctx, alignedStart, alignedEnd, + seriesBlocks, seriesBuffer, nsCtx) +} + +// nolint: gocyclo +func (r *Reader) readersWithBlocksMapAndBufferAligned( + ctx context.Context, + start, end time.Time, + seriesBlocks block.DatabaseSeriesBlocks, + seriesBuffer databaseBuffer, + nsCtx namespace.Context, +) ([][]xio.BlockReader, error) { + var ( + nowFn = r.opts.ClockOptions().NowFn() + now = nowFn() + ropts = r.opts.RetentionOptions() + blockSize = ropts.BlockSize() + readerCount = end.Sub(start) / blockSize + ) + + if readerCount < 0 { + readerCount = 0 + } + + // Two-dimensional slice such that the first dimension is unique by blockstart + // and the second dimension is blocks of data for that blockstart (not necessarily + // in chronological order). + // + // ex. (querying 2P.M -> 6P.M with a 2-hour blocksize): + // [][]xio.BlockReader{ + // {block0, block1, block2}, // <- 2P.M + // {block0, block1}, // <-4P.M + // } + results := make([][]xio.BlockReader, 0, readerCount) + for blockAt := start; !blockAt.After(end); blockAt = blockAt.Add(blockSize) { // resultsBlock holds the results from one block. The flow is: // 1) Look in the cache for metrics for a block. // 2) If there is nothing in the cache, try getting metrics from disk. @@ -140,52 +160,30 @@ func (r Reader) readersWithBlocksMapAndBuffer( // in an out of order error in the MultiReaderIterator on query. var resultsBlock []xio.BlockReader - retrievedFromDiskCache := false - if seriesBlocks != nil { - if block, ok := seriesBlocks.BlockAt(blockAt); ok { - // Block served from in-memory or in-memory metadata - // will defer to disk read - streamedBlock, err := block.Stream(ctx) - if err != nil { - return nil, err - } - if streamedBlock.IsNotEmpty() { - resultsBlock = append(resultsBlock, streamedBlock) - // NB(r): Mark this block as read now - block.SetLastReadTime(now) - if r.onRead != nil { - r.onRead.OnReadBlock(block) - } - } - retrievedFromDiskCache = true - } + blockReader, block, found, err := retrieveCached(ctx, blockAt, seriesBlocks) + if err != nil { + return nil, err } - // Avoid going to disk if data was already in the cache. 
- if !retrievedFromDiskCache { - switch { - case cachePolicy == CacheAll: - // No-op, block metadata should have been in-memory - case r.retriever != nil: - // Try to stream from disk - isRetrievable, err := r.retriever.IsBlockRetrievable(blockAt) - if err != nil { - return nil, err - } - if isRetrievable { - streamedBlock, err := r.retriever.Stream(ctx, r.id, blockAt, r.onRetrieve, nsCtx) - if err != nil { - return nil, err - } - if streamedBlock.IsNotEmpty() { - resultsBlock = append(resultsBlock, streamedBlock) - } - } + if found { + // NB(r): Mark this block as read now + block.SetLastReadTime(now) + if r.onRead != nil { + r.onRead.OnReadBlock(block) + } + } else { + blockReader, found, err = r.streamBlock(ctx, blockAt, r.onRetrieve, nsCtx) + if err != nil { + return nil, err } } + if found { + resultsBlock = append(resultsBlock, blockReader) + } + if seriesBuffer != nil { - bufferResults, err := seriesBuffer.ReadEncoded(ctx, blockAt, blockAt.Add(size), nsCtx) + bufferResults, err := seriesBuffer.ReadEncoded(ctx, blockAt, blockAt.Add(blockSize), nsCtx) if err != nil { return nil, err } @@ -204,12 +202,12 @@ func (r Reader) readersWithBlocksMapAndBuffer( return results, nil } -// FetchIndexChecksum reads index checksum blocks using just a block retriever. -func (r Reader) FetchIndexChecksum( +// FetchWideEntry reads wide entries using just a block retriever. +func (r *Reader) FetchWideEntry( ctx context.Context, blockStart time.Time, nsCtx namespace.Context, -) (block.StreamedChecksum, error) { +) (block.StreamedWideEntry, error) { var ( nowFn = r.opts.ClockOptions().NowFn() now = nowFn() @@ -220,71 +218,31 @@ func (r Reader) FetchIndexChecksum( if blockStart.Before(earliest) { // NB: this block is falling out of retention; return empty result rather // than iterating over it. - return block.EmptyStreamedChecksum, nil + return block.EmptyStreamedWideEntry, nil } if r.retriever == nil { - return block.EmptyStreamedChecksum, nil + return block.EmptyStreamedWideEntry, nil } // Try to stream from disk isRetrievable, err := r.retriever.IsBlockRetrievable(blockStart) if err != nil { - return block.EmptyStreamedChecksum, err + return block.EmptyStreamedWideEntry, err } else if !isRetrievable { - return block.EmptyStreamedChecksum, nil + return block.EmptyStreamedWideEntry, nil } - streamedBlock, err := r.retriever.StreamIndexChecksum(ctx, + streamedEntry, err := r.retriever.StreamWideEntry(ctx, r.id, blockStart, nsCtx) if err != nil { - return block.EmptyStreamedChecksum, err - } - - return streamedBlock, nil -} - -// FetchReadMismatch compiles read mismatches using a block retriever and -// an incoming batchReader. -func (r Reader) FetchReadMismatch( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - blockStart time.Time, - nsCtx namespace.Context, -) (wide.StreamedMismatch, error) { - var ( - nowFn = r.opts.ClockOptions().NowFn() - now = nowFn() - ropts = r.opts.RetentionOptions() - ) - - earliest := retention.FlushTimeStart(ropts, now) - if blockStart.Before(earliest) { - // NB: this block is falling out of retention; return empty result rather - // than iterating over it. 
- return wide.EmptyStreamedMismatch, nil - } - - if r.retriever == nil { - return wide.EmptyStreamedMismatch, nil - } - // Try to stream from disk - isRetrievable, err := r.retriever.IsBlockRetrievable(blockStart) - if err != nil { - return wide.EmptyStreamedMismatch, err - } else if !isRetrievable { - return wide.EmptyStreamedMismatch, nil - } - streamedMismatches, err := r.retriever.StreamReadMismatches(ctx, - mismatchChecker, r.id, blockStart, nsCtx) - if err != nil { - return wide.EmptyStreamedMismatch, err + return block.EmptyStreamedWideEntry, err } - return streamedMismatches, nil + return streamedEntry, nil } // FetchBlocks returns data blocks given a list of block start times using // just a block retriever. -func (r Reader) FetchBlocks( +func (r *Reader) FetchBlocks( ctx context.Context, starts []time.Time, nsCtx namespace.Context, @@ -292,102 +250,14 @@ func (r Reader) FetchBlocks( return r.fetchBlocksWithBlocksMapAndBuffer(ctx, starts, nil, nil, nsCtx) } -func (r Reader) fetchBlocksWithBlocksMapAndBuffer( +func (r *Reader) fetchBlocksWithBlocksMapAndBuffer( ctx context.Context, starts []time.Time, seriesBlocks block.DatabaseSeriesBlocks, seriesBuffer databaseBuffer, nsCtx namespace.Context, ) ([]block.FetchBlockResult, error) { - var ( - // Two-dimensional slice (each block.FetchBlockResult has a []xio.BlockReader internally) - // such that the first dimension is unique by blockstart and the second dimension is blocks - // of data for that blockstart (not necessarily in chronological order). - // - // ex. (querying 2P.M -> 6P.M with a 2-hour blocksize): - // []block.FetchBlockResult{ - // block.FetchBlockResult{ - // Start: 2P.M, - // Blocks: []xio.BlockReader{block0, block1, block2}, - // }, - // block.FetchBlockResult{ - // Start: 4P.M, - // Blocks: []xio.BlockReader{block0}, - // }, - // } - res = make([]block.FetchBlockResult, 0, len(starts)) - cachePolicy = r.opts.CachePolicy() - // NB(r): Always use nil for OnRetrieveBlock so we don't cache the - // series after fetching it from disk, the fetch blocks API is called - // during streaming so to cache it in memory would mean we would - // eventually cache all series in memory when we stream results to a - // peer. - onRetrieve block.OnRetrieveBlock - ) - for _, start := range starts { - // Slice of xio.BlockReader such that all data belong to the same blockstart. - var blockReaders []xio.BlockReader - - retrievedFromDiskCache := false - if seriesBlocks != nil { - if b, exists := seriesBlocks.BlockAt(start); exists { - streamedBlock, err := b.Stream(ctx) - if err != nil { - // Short-circuit this entire blockstart if an error was encountered. - r := block.NewFetchBlockResult(start, nil, - fmt.Errorf("unable to retrieve block stream for series %s time %v: %v", - r.id.String(), start, err)) - res = append(res, r) - continue - } - - if streamedBlock.IsNotEmpty() { - blockReaders = append(blockReaders, streamedBlock) - } - retrievedFromDiskCache = true - } - } - - // Avoid going to disk if data was already in the cache. - if !retrievedFromDiskCache { - switch { - case cachePolicy == CacheAll: - // No-op, block metadata should have been in-memory - case r.retriever != nil: - // Try to stream from disk - isRetrievable, err := r.retriever.IsBlockRetrievable(start) - if err != nil { - // Short-circuit this entire blockstart if an error was encountered. 
- r := block.NewFetchBlockResult(start, nil, - fmt.Errorf("unable to retrieve block stream for series %s time %v: %v", - r.id.String(), start, err)) - res = append(res, r) - continue - } - - if isRetrievable { - streamedBlock, err := r.retriever.Stream(ctx, r.id, start, onRetrieve, nsCtx) - if err != nil { - // Short-circuit this entire blockstart if an error was encountered. - r := block.NewFetchBlockResult(start, nil, - fmt.Errorf("unable to retrieve block stream for series %s time %v: %v", - r.id.String(), start, err)) - res = append(res, r) - continue - } - - if streamedBlock.IsNotEmpty() { - blockReaders = append(blockReaders, streamedBlock) - } - } - } - } - - if len(blockReaders) > 0 { - res = append(res, block.NewFetchBlockResult(start, blockReaders, nil)) - } - } - + res := r.resolveBlockResults(ctx, starts, seriesBlocks, nsCtx) if seriesBuffer != nil && !seriesBuffer.IsEmpty() { bufferResults := seriesBuffer.FetchBlocks(ctx, starts, nsCtx) @@ -395,7 +265,8 @@ func (r Reader) fetchBlocksWithBlocksMapAndBuffer( block.SortFetchBlockResultByTimeAscending(res) block.SortFetchBlockResultByTimeAscending(bufferResults) bufferIdx := 0 - for i, blockResult := range res { + for i := range res { + blockResult := res[i] if !(bufferIdx < len(bufferResults)) { break } @@ -423,3 +294,122 @@ func (r Reader) fetchBlocksWithBlocksMapAndBuffer( block.SortFetchBlockResultByTimeAscending(res) return res, nil } + +func (r *Reader) resolveBlockResults( + ctx context.Context, + starts []time.Time, + seriesBlocks block.DatabaseSeriesBlocks, + nsCtx namespace.Context, +) []block.FetchBlockResult { + // Two-dimensional slice (each block.FetchBlockResult has a []xio.BlockReader internally) + // such that the first dimension is unique by blockstart and the second dimension is blocks + // of data for that blockstart (not necessarily in chronological order). + // + // ex. (querying 2P.M -> 6P.M with a 2-hour blocksize): + // []block.FetchBlockResult{ + // block.FetchBlockResult{ + // Start: 2P.M, + // Blocks: []xio.BlockReader{block0, block1, block2}, + // }, + // block.FetchBlockResult{ + // Start: 4P.M, + // Blocks: []xio.BlockReader{block0}, + // }, + // } + res := make([]block.FetchBlockResult, 0, len(starts)) + for _, start := range starts { + // Slice of xio.BlockReader such that all data belong to the same blockstart. + var blockReaders []xio.BlockReader + + blockReader, _, found, err := retrieveCached(ctx, start, seriesBlocks) + if err != nil { + // Short-circuit this entire blockstart if an error was encountered. + r := block.NewFetchBlockResult(start, nil, + fmt.Errorf("unable to retrieve block stream for series %s time %v: %w", + r.id.String(), start, err)) + res = append(res, r) + continue + } + + if !found { + // NB(r): Always use nil for OnRetrieveBlock so we don't cache the + // series after fetching it from disk, the fetch blocks API is called + // during streaming so to cache it in memory would mean we would + // eventually cache all series in memory when we stream results to a + // peer. + blockReader, found, err = r.streamBlock(ctx, start, nil, nsCtx) + if err != nil { + // Short-circuit this entire blockstart if an error was encountered. 
+ r := block.NewFetchBlockResult(start, nil,
+ fmt.Errorf("unable to retrieve block stream for series %s time %v: %w",
+ r.id.String(), start, err))
+ res = append(res, r)
+ continue
+ }
+ }
+
+ if found {
+ blockReaders = append(blockReaders, blockReader)
+ }
+
+ if len(blockReaders) > 0 {
+ res = append(res, block.NewFetchBlockResult(start, blockReaders, nil))
+ }
+ }
+
+ return res
+}
+
+func retrieveCached(
+ ctx context.Context,
+ start time.Time,
+ seriesBlocks block.DatabaseSeriesBlocks,
+) (xio.BlockReader, block.DatabaseBlock, bool, error) {
+ if seriesBlocks != nil {
+ if b, exists := seriesBlocks.BlockAt(start); exists {
+ streamedBlock, err := b.Stream(ctx)
+ if err != nil {
+ return xio.BlockReader{}, b, false, err
+ }
+
+ if streamedBlock.IsNotEmpty() {
+ return streamedBlock, b, true, nil
+ }
+ }
+ }
+
+ return xio.BlockReader{}, nil, false, nil
+}
+
+func (r *Reader) streamBlock(
+ ctx context.Context,
+ start time.Time,
+ onRetrieve block.OnRetrieveBlock,
+ nsCtx namespace.Context,
+) (xio.BlockReader, bool, error) {
+ cachePolicy := r.opts.CachePolicy()
+ switch {
+ case cachePolicy == CacheAll:
+ // No-op, block metadata should have been in-memory
+ case r.retriever != nil:
+ // Try to stream from disk
+ isRetrievable, err := r.retriever.IsBlockRetrievable(start)
+ if err != nil {
+ return xio.BlockReader{}, false, err
+ }
+
+ if isRetrievable {
+ streamedBlock, err := r.retriever.Stream(ctx, r.id, start, onRetrieve, nsCtx)
+ if err != nil {
+ // Short-circuit this entire blockstart if an error was encountered.
+ return xio.BlockReader{}, false, err
+ }
+
+ if streamedBlock.IsNotEmpty() {
+ return streamedBlock, true, nil
+ }
+ }
+ }
+
+ return xio.BlockReader{}, false, nil
+}
diff --git a/src/dbnode/storage/series/reader_test.go b/src/dbnode/storage/series/reader_test.go
index aeb0522e9e..70090c785c 100644
--- a/src/dbnode/storage/series/reader_test.go
+++ b/src/dbnode/storage/series/reader_test.go
@@ -88,7 +88,7 @@ func TestReaderUsingRetrieverReadEncoded(t *testing.T) {
 }
 }
 
-func TestReaderUsingRetrieverIndexChecksumsBlockInvalid(t *testing.T) {
+func TestReaderUsingRetrieverWideEntriesBlockInvalid(t *testing.T) {
 ctrl := xtest.NewController(t)
 defer ctrl.Finish()
 
@@ -102,20 +102,20 @@
 retriever.EXPECT().IsBlockRetrievable(gomock.Any()).
Return(false, errors.New("err"))
- _, err := reader.FetchIndexChecksum(ctx, time.Now(), namespace.Context{})
+ _, err := reader.FetchWideEntry(ctx, time.Now(), namespace.Context{})
 assert.EqualError(t, err, "err")
 
 retriever.EXPECT().IsBlockRetrievable(gomock.Any()).Return(false, nil)
- c, err := reader.FetchIndexChecksum(ctx, time.Now(), namespace.Context{})
+ e, err := reader.FetchWideEntry(ctx, time.Now(), namespace.Context{})
 assert.NoError(t, err)
 
- checksum, err := c.RetrieveIndexChecksum()
+ entry, err := e.RetrieveWideEntry()
 require.NoError(t, err)
- assert.Equal(t, int64(0), checksum.MetadataChecksum)
- assert.Nil(t, checksum.ID)
+ assert.Equal(t, int64(0), entry.MetadataChecksum)
+ assert.Nil(t, entry.ID)
 }
 
-func TestReaderUsingRetrieverIndexChecksums(t *testing.T) {
+func TestReaderUsingRetrieverWideEntries(t *testing.T) {
 ctrl := xtest.NewController(t)
 defer ctrl.Finish()
@@ -128,37 +128,36 @@
 retriever := NewMockQueryableBlockRetriever(ctrl)
 retriever.EXPECT().IsBlockRetrievable(alignedStart).Return(true, nil).Times(2)
 
- checksum := xio.IndexChecksum{
- MetadataChecksum: 5,
- ID: ident.StringID("foo"),
- }
-
- indexChecksum := block.NewMockStreamedChecksum(ctrl)
-
+ streamedEntry := block.NewMockStreamedWideEntry(ctrl)
 ctx := opts.ContextPool().Get()
 defer ctx.Close()
 
 retriever.EXPECT().
- StreamIndexChecksum(ctx, ident.NewIDMatcher("foo"),
+ StreamWideEntry(ctx, ident.NewIDMatcher("foo"),
 alignedStart, gomock.Any()).
- Return(indexChecksum, nil).Times(2)
+ Return(streamedEntry, nil).Times(2)
 
 reader := NewReaderUsingRetriever(
 ident.StringID("foo"), retriever, nil, nil, opts)
 
- indexChecksum.EXPECT().RetrieveIndexChecksum().Return(xio.IndexChecksum{}, errors.New("err"))
- streamed, err := reader.FetchIndexChecksum(ctx, alignedStart, namespace.Context{})
+ streamedEntry.EXPECT().RetrieveWideEntry().Return(xio.WideEntry{}, errors.New("err"))
+ streamed, err := reader.FetchWideEntry(ctx, alignedStart, namespace.Context{})
 require.NoError(t, err)
- _, err = streamed.RetrieveIndexChecksum()
+ _, err = streamed.RetrieveWideEntry()
 assert.EqualError(t, err, "err")
 
 // Check reads as expected
- indexChecksum.EXPECT().RetrieveIndexChecksum().Return(checksum, nil)
- streamed, err = reader.FetchIndexChecksum(ctx, alignedStart, namespace.Context{})
+ entry := xio.WideEntry{
+ MetadataChecksum: 5,
+ ID: ident.StringID("foo"),
+ }
+
+ streamedEntry.EXPECT().RetrieveWideEntry().Return(entry, nil)
+ streamed, err = reader.FetchWideEntry(ctx, alignedStart, namespace.Context{})
 require.NoError(t, err)
- actual, err := streamed.RetrieveIndexChecksum()
+ actual, err := streamed.RetrieveWideEntry()
 require.NoError(t, err)
- assert.Equal(t, checksum, actual)
+ assert.Equal(t, entry, actual)
 }
 
 type readTestCase struct {
diff --git a/src/dbnode/storage/series/series.go b/src/dbnode/storage/series/series.go
index a6ea48a893..4c008db977 100644
--- a/src/dbnode/storage/series/series.go
+++ b/src/dbnode/storage/series/series.go
@@ -27,7 +27,6 @@ import (
 "time"
 
 "github.com/m3db/m3/src/dbnode/persist"
- "github.com/m3db/m3/src/dbnode/persist/fs/wide"
 "github.com/m3db/m3/src/dbnode/storage/block"
 "github.com/m3db/m3/src/dbnode/ts"
 "github.com/m3db/m3/src/dbnode/x/xio"
@@ -403,29 +402,17 @@
 return r, err
 }
 
-func (s *dbSeries) FetchIndexChecksum(
+func (s *dbSeries) FetchWideEntry(
 ctx context.Context,
 blockStart time.Time,
 nsCtx namespace.Context,
-) (block.StreamedChecksum, error) {
+) (block.StreamedWideEntry, error) 
{ s.RLock() reader := NewReaderUsingRetriever(s.id, s.blockRetriever, s.onRetrieveBlock, s, s.opts) - r, err := reader.FetchIndexChecksum(ctx, blockStart, nsCtx) + e, err := reader.FetchWideEntry(ctx, blockStart, nsCtx) s.RUnlock() - return r, err -} -func (s *dbSeries) FetchReadMismatch( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - blockStart time.Time, - nsCtx namespace.Context, -) (wide.StreamedMismatch, error) { - s.RLock() - reader := NewReaderUsingRetriever(s.id, s.blockRetriever, s.onRetrieveBlock, s, s.opts) - r, err := reader.FetchReadMismatch(ctx, mismatchChecker, blockStart, nsCtx) - s.RUnlock() - return r, err + return e, err } func (s *dbSeries) FetchBlocksForColdFlush( @@ -449,12 +436,14 @@ func (s *dbSeries) FetchBlocks( nsCtx namespace.Context, ) ([]block.FetchBlockResult, error) { s.RLock() - r, err := Reader{ + reader := &Reader{ opts: s.opts, id: s.id, retriever: s.blockRetriever, onRetrieve: s.onRetrieveBlock, - }.fetchBlocksWithBlocksMapAndBuffer(ctx, starts, s.cachedBlocks, s.buffer, nsCtx) + } + + r, err := reader.fetchBlocksWithBlocksMapAndBuffer(ctx, starts, s.cachedBlocks, s.buffer, nsCtx) s.RUnlock() return r, err } diff --git a/src/dbnode/storage/series/series_mock.go b/src/dbnode/storage/series/series_mock.go index 3a8fdee444..cc20ef70b4 100644 --- a/src/dbnode/storage/series/series_mock.go +++ b/src/dbnode/storage/series/series_mock.go @@ -30,7 +30,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/ts" "github.com/m3db/m3/src/dbnode/x/xio" @@ -150,34 +149,19 @@ func (mr *MockDatabaseSeriesMockRecorder) FetchBlocksMetadata(arg0, arg1, arg2, return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchBlocksMetadata", reflect.TypeOf((*MockDatabaseSeries)(nil).FetchBlocksMetadata), arg0, arg1, arg2, arg3) } -// FetchIndexChecksum mocks base method -func (m *MockDatabaseSeries) FetchIndexChecksum(arg0 context.Context, arg1 time.Time, arg2 namespace.Context) (block.StreamedChecksum, error) { +// FetchWideEntry mocks base method +func (m *MockDatabaseSeries) FetchWideEntry(arg0 context.Context, arg1 time.Time, arg2 namespace.Context) (block.StreamedWideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchIndexChecksum", arg0, arg1, arg2) - ret0, _ := ret[0].(block.StreamedChecksum) + ret := m.ctrl.Call(m, "FetchWideEntry", arg0, arg1, arg2) + ret0, _ := ret[0].(block.StreamedWideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// FetchIndexChecksum indicates an expected call of FetchIndexChecksum -func (mr *MockDatabaseSeriesMockRecorder) FetchIndexChecksum(arg0, arg1, arg2 interface{}) *gomock.Call { +// FetchWideEntry indicates an expected call of FetchWideEntry +func (mr *MockDatabaseSeriesMockRecorder) FetchWideEntry(arg0, arg1, arg2 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchIndexChecksum", reflect.TypeOf((*MockDatabaseSeries)(nil).FetchIndexChecksum), arg0, arg1, arg2) -} - -// FetchReadMismatch mocks base method -func (m *MockDatabaseSeries) FetchReadMismatch(arg0 context.Context, arg1 wide.EntryChecksumMismatchChecker, arg2 time.Time, arg3 namespace.Context) (wide.StreamedMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchReadMismatch", arg0, arg1, arg2, arg3) - ret0, _ := ret[0].(wide.StreamedMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - 
-// FetchReadMismatch indicates an expected call of FetchReadMismatch -func (mr *MockDatabaseSeriesMockRecorder) FetchReadMismatch(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchReadMismatch", reflect.TypeOf((*MockDatabaseSeries)(nil).FetchReadMismatch), arg0, arg1, arg2, arg3) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchWideEntry", reflect.TypeOf((*MockDatabaseSeries)(nil).FetchWideEntry), arg0, arg1, arg2) } // ID mocks base method @@ -472,32 +456,17 @@ func (mr *MockQueryableBlockRetrieverMockRecorder) Stream(arg0, arg1, arg2, arg3 return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stream", reflect.TypeOf((*MockQueryableBlockRetriever)(nil).Stream), arg0, arg1, arg2, arg3, arg4) } -// StreamIndexChecksum mocks base method -func (m *MockQueryableBlockRetriever) StreamIndexChecksum(arg0 context.Context, arg1 ident.ID, arg2 time.Time, arg3 namespace.Context) (block.StreamedChecksum, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StreamIndexChecksum", arg0, arg1, arg2, arg3) - ret0, _ := ret[0].(block.StreamedChecksum) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// StreamIndexChecksum indicates an expected call of StreamIndexChecksum -func (mr *MockQueryableBlockRetrieverMockRecorder) StreamIndexChecksum(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamIndexChecksum", reflect.TypeOf((*MockQueryableBlockRetriever)(nil).StreamIndexChecksum), arg0, arg1, arg2, arg3) -} - -// StreamReadMismatches mocks base method -func (m *MockQueryableBlockRetriever) StreamReadMismatches(arg0 context.Context, arg1 wide.EntryChecksumMismatchChecker, arg2 ident.ID, arg3 time.Time, arg4 namespace.Context) (wide.StreamedMismatch, error) { +// StreamWideEntry mocks base method +func (m *MockQueryableBlockRetriever) StreamWideEntry(arg0 context.Context, arg1 ident.ID, arg2 time.Time, arg3 namespace.Context) (block.StreamedWideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StreamReadMismatches", arg0, arg1, arg2, arg3, arg4) - ret0, _ := ret[0].(wide.StreamedMismatch) + ret := m.ctrl.Call(m, "StreamWideEntry", arg0, arg1, arg2, arg3) + ret0, _ := ret[0].(block.StreamedWideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// StreamReadMismatches indicates an expected call of StreamReadMismatches -func (mr *MockQueryableBlockRetrieverMockRecorder) StreamReadMismatches(arg0, arg1, arg2, arg3, arg4 interface{}) *gomock.Call { +// StreamWideEntry indicates an expected call of StreamWideEntry +func (mr *MockQueryableBlockRetrieverMockRecorder) StreamWideEntry(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamReadMismatches", reflect.TypeOf((*MockQueryableBlockRetriever)(nil).StreamReadMismatches), arg0, arg1, arg2, arg3, arg4) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StreamWideEntry", reflect.TypeOf((*MockQueryableBlockRetriever)(nil).StreamWideEntry), arg0, arg1, arg2, arg3) } diff --git a/src/dbnode/storage/series/types.go b/src/dbnode/storage/series/types.go index 439393e8c4..779dcdd858 100644 --- a/src/dbnode/storage/series/types.go +++ b/src/dbnode/storage/series/types.go @@ -26,7 +26,6 @@ import ( "github.com/m3db/m3/src/dbnode/encoding" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" 
"github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/storage/block" @@ -87,21 +86,12 @@ type DatabaseSeries interface { nsCtx namespace.Context, ) ([][]xio.BlockReader, error) - // FetchIndexChecksum reads checksums from encoded blocks. - FetchIndexChecksum( + // FetchWideEntry reads wide entries from encoded blocks. + FetchWideEntry( ctx context.Context, blockStart time.Time, nsCtx namespace.Context, - ) (block.StreamedChecksum, error) - - // FetchIndexChecksum reads checksum mismatches from encoded blocks and the - // incoming batchReader. - FetchReadMismatch( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - blockStart time.Time, - nsCtx namespace.Context, - ) (wide.StreamedMismatch, error) + ) (block.StreamedWideEntry, error) // FetchBlocks returns data blocks given a list of block start times. FetchBlocks( diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go index 49f8642757..029a8b9049 100644 --- a/src/dbnode/storage/shard.go +++ b/src/dbnode/storage/shard.go @@ -36,7 +36,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/storage/block" @@ -399,29 +398,17 @@ func (s *dbShard) Stream( blockStart, onRetrieve, nsCtx) } -// StreamIndexChecksum implements series.QueryableBlockRetriever -func (s *dbShard) StreamIndexChecksum( +// StreamWideEntry implements series.QueryableBlockRetriever +func (s *dbShard) StreamWideEntry( ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context, -) (block.StreamedChecksum, error) { - return s.DatabaseBlockRetriever.StreamIndexChecksum(ctx, s.shard, id, +) (block.StreamedWideEntry, error) { + return s.DatabaseBlockRetriever.StreamWideEntry(ctx, s.shard, id, blockStart, nsCtx) } -// StreamIndexChecksum implements series.QueryableBlockRetriever -func (s *dbShard) StreamReadMismatches( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - id ident.ID, - blockStart time.Time, - nsCtx namespace.Context, -) (wide.StreamedMismatch, error) { - return s.DatabaseBlockRetriever.StreamReadMismatches(ctx, s.shard, - mismatchChecker, id, blockStart, nsCtx) -} - // IsBlockRetrievable implements series.QueryableBlockRetriever func (s *dbShard) IsBlockRetrievable(blockStart time.Time) (bool, error) { return s.hasWarmFlushed(blockStart) @@ -1155,29 +1142,17 @@ func (s *dbShard) ReadEncoded( return reader.ReadEncoded(ctx, start, end, nsCtx) } -func (s *dbShard) FetchIndexChecksum( +func (s *dbShard) FetchWideEntry( ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context, -) (block.StreamedChecksum, error) { +) (block.StreamedWideEntry, error) { retriever := s.seriesBlockRetriever opts := s.seriesOpts reader := series.NewReaderUsingRetriever(id, retriever, nil, nil, opts) - return reader.FetchIndexChecksum(ctx, blockStart, nsCtx) -} -func (s *dbShard) FetchReadMismatch( - ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, - id ident.ID, - blockStart time.Time, - nsCtx namespace.Context, -) (wide.StreamedMismatch, error) { - retriever := s.seriesBlockRetriever - opts := s.seriesOpts - reader := series.NewReaderUsingRetriever(id, retriever, nil, nil, opts) - return reader.FetchReadMismatch(ctx, mismatchChecker, blockStart, nsCtx) + 
return reader.FetchWideEntry(ctx, blockStart, nsCtx) } // lookupEntryWithLock returns the entry for a given id while holding a read lock or a write lock. diff --git a/src/dbnode/storage/shard_test.go b/src/dbnode/storage/shard_test.go index 5c5077d801..53c1da1144 100644 --- a/src/dbnode/storage/shard_test.go +++ b/src/dbnode/storage/shard_test.go @@ -40,7 +40,6 @@ import ( "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/storage/block" @@ -1626,28 +1625,28 @@ func TestShardFetchIndexChecksum(t *testing.T) { retriever := block.NewMockDatabaseBlockRetriever(ctrl) shard.setBlockRetriever(retriever) - checksum := xio.IndexChecksum{ + checksum := xio.WideEntry{ ID: ident.StringID("foo"), MetadataChecksum: 5, } - indexChecksum := block.NewMockStreamedChecksum(ctrl) + wideEntry := block.NewMockStreamedWideEntry(ctrl) retriever.EXPECT(). - StreamIndexChecksum(ctx, shard.shard, ident.NewIDMatcher("foo"), - start, gomock.Any()).Return(indexChecksum, nil).Times(2) + StreamWideEntry(ctx, shard.shard, ident.NewIDMatcher("foo"), + start, gomock.Any()).Return(wideEntry, nil).Times(2) - // First call to RetrieveIndexChecksum is expected to error on retrieval - indexChecksum.EXPECT().RetrieveIndexChecksum(). - Return(xio.IndexChecksum{}, errors.New("err")) - r, err := shard.FetchIndexChecksum(ctx, ident.StringID("foo"), start, namespace.Context{}) + // First call to RetrieveWideEntry is expected to error on retrieval + wideEntry.EXPECT().RetrieveWideEntry(). + Return(xio.WideEntry{}, errors.New("err")) + r, err := shard.FetchWideEntry(ctx, ident.StringID("foo"), start, namespace.Context{}) require.NoError(t, err) - _, err = r.RetrieveIndexChecksum() + _, err = r.RetrieveWideEntry() assert.EqualError(t, err, "err") - indexChecksum.EXPECT().RetrieveIndexChecksum().Return(checksum, nil) - r, err = shard.FetchIndexChecksum(ctx, ident.StringID("foo"), start, namespace.Context{}) + wideEntry.EXPECT().RetrieveWideEntry().Return(checksum, nil) + r, err = shard.FetchWideEntry(ctx, ident.StringID("foo"), start, namespace.Context{}) require.NoError(t, err) - retrieved, err := r.RetrieveIndexChecksum() + retrieved, err := r.RetrieveWideEntry() require.NoError(t, err) assert.Equal(t, checksum, retrieved) @@ -1662,77 +1661,6 @@ func TestShardFetchIndexChecksum(t *testing.T) { require.Nil(t, entry) } -func TestShardFetchReadMismatch(t *testing.T) { - dir, err := ioutil.TempDir("", "testdir") - require.NoError(t, err) - defer os.RemoveAll(dir) - - ctrl := xtest.NewController(t) - defer ctrl.Finish() - - opts := DefaultTestOptions(). - SetSeriesCachePolicy(series.CacheAll) - fsOpts := opts.CommitLogOptions().FilesystemOptions(). - SetFilePathPrefix(dir) - opts = opts. - SetCommitLogOptions(opts.CommitLogOptions(). 
- SetFilesystemOptions(fsOpts)) - shard := testDatabaseShard(t, opts) - defer shard.Close() - - ctx := context.NewContext() - defer ctx.Close() - - nsCtx := namespace.Context{ID: ident.StringID("foo")} - require.NoError(t, shard.Bootstrap(ctx, nsCtx)) - - ropts := shard.seriesOpts.RetentionOptions() - end := opts.ClockOptions().NowFn()().Truncate(ropts.BlockSize()) - start := end.Add(-2 * ropts.BlockSize()) - shard.markWarmFlushStateSuccess(start) - shard.markWarmFlushStateSuccess(start.Add(ropts.BlockSize())) - - checker := wide.NewMockEntryChecksumMismatchChecker(ctrl) - retriever := block.NewMockDatabaseBlockRetriever(ctrl) - shard.setBlockRetriever(retriever) - - mismatchBatch := wide.ReadMismatch{ - IndexChecksum: xio.IndexChecksum{MetadataChecksum: 1}, - } - - streamedBatch := wide.NewMockStreamedMismatch(ctrl) - retriever.EXPECT(). - StreamReadMismatches(ctx, shard.shard, checker, ident.NewIDMatcher("foo"), - start, gomock.Any()).Return(streamedBatch, nil).Times(2) - - // First call to RetrieveMismatch is expected to error on retrieval - streamedBatch.EXPECT().RetrieveMismatch(). - Return(wide.ReadMismatch{}, errors.New("err")) - r, err := shard.FetchReadMismatch(ctx, checker, - ident.StringID("foo"), start, namespace.Context{}) - require.NoError(t, err) - _, err = r.RetrieveMismatch() - assert.EqualError(t, err, "err") - - streamedBatch.EXPECT().RetrieveMismatch().Return(mismatchBatch, nil) - r, err = shard.StreamReadMismatches(ctx, checker, - ident.StringID("foo"), start, namespace.Context{}) - require.NoError(t, err) - retrieved, err := r.RetrieveMismatch() - require.NoError(t, err) - assert.Equal(t, mismatchBatch, retrieved) - - // Check that nothing has been cached. Should be cached after a second. - time.Sleep(time.Second) - - shard.RLock() - entry, _, err := shard.lookupEntryWithLock(ident.StringID("foo")) - shard.RUnlock() - - require.Equal(t, err, errShardEntryNotFound) - require.Nil(t, entry) -} - func TestShardReadEncodedCachesSeriesWithRecentlyReadPolicy(t *testing.T) { dir, err := ioutil.TempDir("", "testdir") require.NoError(t, err) diff --git a/src/dbnode/storage/storage_mock.go b/src/dbnode/storage/storage_mock.go index b11da30f9c..87a3c33efa 100644 --- a/src/dbnode/storage/storage_mock.go +++ b/src/dbnode/storage/storage_mock.go @@ -35,7 +35,6 @@ import ( "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" @@ -346,10 +345,10 @@ func (mr *MockDatabaseMockRecorder) ReadEncoded(ctx, namespace, id, start, end i } // WideQuery mocks base method -func (m *MockDatabase) WideQuery(ctx context.Context, namespace ident.ID, query index.Query, start time.Time, shards []uint32, iterOpts index.IterationOptions) ([]xio.IndexChecksum, error) { +func (m *MockDatabase) WideQuery(ctx context.Context, namespace ident.ID, query index.Query, start time.Time, shards []uint32, iterOpts index.IterationOptions) ([]xio.WideEntry, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "WideQuery", ctx, namespace, query, start, shards, iterOpts) - ret0, _ := ret[0].([]xio.IndexChecksum) + ret0, _ := ret[0].([]xio.WideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -360,21 +359,6 @@ func (mr *MockDatabaseMockRecorder) WideQuery(ctx, namespace, query, start, shar return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WideQuery", 
reflect.TypeOf((*MockDatabase)(nil).WideQuery), ctx, namespace, query, start, shards, iterOpts) } -// ReadMismatches mocks base method -func (m *MockDatabase) ReadMismatches(ctx context.Context, namespace ident.ID, query index.Query, mismatchChecker wide.EntryChecksumMismatchChecker, queryStart time.Time, shards []uint32, iterOpts index.IterationOptions) ([]wide.ReadMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ReadMismatches", ctx, namespace, query, mismatchChecker, queryStart, shards, iterOpts) - ret0, _ := ret[0].([]wide.ReadMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// ReadMismatches indicates an expected call of ReadMismatches -func (mr *MockDatabaseMockRecorder) ReadMismatches(ctx, namespace, query, mismatchChecker, queryStart, shards, iterOpts interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReadMismatches", reflect.TypeOf((*MockDatabase)(nil).ReadMismatches), ctx, namespace, query, mismatchChecker, queryStart, shards, iterOpts) -} - // FetchBlocks mocks base method func (m *MockDatabase) FetchBlocks(ctx context.Context, namespace ident.ID, shard uint32, id ident.ID, starts []time.Time) ([]block.FetchBlockResult, error) { m.ctrl.T.Helper() @@ -786,10 +770,10 @@ func (mr *MockdatabaseMockRecorder) ReadEncoded(ctx, namespace, id, start, end i } // WideQuery mocks base method -func (m *Mockdatabase) WideQuery(ctx context.Context, namespace ident.ID, query index.Query, start time.Time, shards []uint32, iterOpts index.IterationOptions) ([]xio.IndexChecksum, error) { +func (m *Mockdatabase) WideQuery(ctx context.Context, namespace ident.ID, query index.Query, start time.Time, shards []uint32, iterOpts index.IterationOptions) ([]xio.WideEntry, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "WideQuery", ctx, namespace, query, start, shards, iterOpts) - ret0, _ := ret[0].([]xio.IndexChecksum) + ret0, _ := ret[0].([]xio.WideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -800,21 +784,6 @@ func (mr *MockdatabaseMockRecorder) WideQuery(ctx, namespace, query, start, shar return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WideQuery", reflect.TypeOf((*Mockdatabase)(nil).WideQuery), ctx, namespace, query, start, shards, iterOpts) } -// ReadMismatches mocks base method -func (m *Mockdatabase) ReadMismatches(ctx context.Context, namespace ident.ID, query index.Query, mismatchChecker wide.EntryChecksumMismatchChecker, queryStart time.Time, shards []uint32, iterOpts index.IterationOptions) ([]wide.ReadMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ReadMismatches", ctx, namespace, query, mismatchChecker, queryStart, shards, iterOpts) - ret0, _ := ret[0].([]wide.ReadMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// ReadMismatches indicates an expected call of ReadMismatches -func (mr *MockdatabaseMockRecorder) ReadMismatches(ctx, namespace, query, mismatchChecker, queryStart, shards, iterOpts interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReadMismatches", reflect.TypeOf((*Mockdatabase)(nil).ReadMismatches), ctx, namespace, query, mismatchChecker, queryStart, shards, iterOpts) -} - // FetchBlocks mocks base method func (m *Mockdatabase) FetchBlocks(ctx context.Context, namespace ident.ID, shard uint32, id ident.ID, starts []time.Time) ([]block.FetchBlockResult, error) { m.ctrl.T.Helper() @@ -1499,34 +1468,19 @@ func (mr *MockdatabaseNamespaceMockRecorder) ReadEncoded(ctx, id, start, end int 
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReadEncoded", reflect.TypeOf((*MockdatabaseNamespace)(nil).ReadEncoded), ctx, id, start, end) } -// FetchIndexChecksum mocks base method -func (m *MockdatabaseNamespace) FetchIndexChecksum(ctx context.Context, id ident.ID, blockStart time.Time) (block.StreamedChecksum, error) { +// FetchWideEntry mocks base method +func (m *MockdatabaseNamespace) FetchWideEntry(ctx context.Context, id ident.ID, blockStart time.Time) (block.StreamedWideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchIndexChecksum", ctx, id, blockStart) - ret0, _ := ret[0].(block.StreamedChecksum) + ret := m.ctrl.Call(m, "FetchWideEntry", ctx, id, blockStart) + ret0, _ := ret[0].(block.StreamedWideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// FetchIndexChecksum indicates an expected call of FetchIndexChecksum -func (mr *MockdatabaseNamespaceMockRecorder) FetchIndexChecksum(ctx, id, blockStart interface{}) *gomock.Call { +// FetchWideEntry indicates an expected call of FetchWideEntry +func (mr *MockdatabaseNamespaceMockRecorder) FetchWideEntry(ctx, id, blockStart interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchIndexChecksum", reflect.TypeOf((*MockdatabaseNamespace)(nil).FetchIndexChecksum), ctx, id, blockStart) -} - -// FetchReadMismatch mocks base method -func (m *MockdatabaseNamespace) FetchReadMismatch(ctx context.Context, mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time) (wide.StreamedMismatch, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchReadMismatch", ctx, mismatchChecker, id, blockStart) - ret0, _ := ret[0].(wide.StreamedMismatch) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// FetchReadMismatch indicates an expected call of FetchReadMismatch -func (mr *MockdatabaseNamespaceMockRecorder) FetchReadMismatch(ctx, mismatchChecker, id, blockStart interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchReadMismatch", reflect.TypeOf((*MockdatabaseNamespace)(nil).FetchReadMismatch), ctx, mismatchChecker, id, blockStart) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchWideEntry", reflect.TypeOf((*MockdatabaseNamespace)(nil).FetchWideEntry), ctx, id, blockStart) } // FetchBlocks mocks base method @@ -2037,34 +1991,19 @@ func (mr *MockdatabaseShardMockRecorder) ReadEncoded(ctx, id, start, end, nsCtx return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReadEncoded", reflect.TypeOf((*MockdatabaseShard)(nil).ReadEncoded), ctx, id, start, end, nsCtx) } -// FetchIndexChecksum mocks base method -func (m *MockdatabaseShard) FetchIndexChecksum(ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (block.StreamedChecksum, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchIndexChecksum", ctx, id, blockStart, nsCtx) - ret0, _ := ret[0].(block.StreamedChecksum) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// FetchIndexChecksum indicates an expected call of FetchIndexChecksum -func (mr *MockdatabaseShardMockRecorder) FetchIndexChecksum(ctx, id, blockStart, nsCtx interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchIndexChecksum", reflect.TypeOf((*MockdatabaseShard)(nil).FetchIndexChecksum), ctx, id, blockStart, nsCtx) -} - -// FetchReadMismatch mocks base method -func (m *MockdatabaseShard) FetchReadMismatch(ctx context.Context, mismatchChecker 
wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (wide.StreamedMismatch, error) { +// FetchWideEntry mocks base method +func (m *MockdatabaseShard) FetchWideEntry(ctx context.Context, id ident.ID, blockStart time.Time, nsCtx namespace.Context) (block.StreamedWideEntry, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchReadMismatch", ctx, mismatchChecker, id, blockStart, nsCtx) - ret0, _ := ret[0].(wide.StreamedMismatch) + ret := m.ctrl.Call(m, "FetchWideEntry", ctx, id, blockStart, nsCtx) + ret0, _ := ret[0].(block.StreamedWideEntry) ret1, _ := ret[1].(error) return ret0, ret1 } -// FetchReadMismatch indicates an expected call of FetchReadMismatch -func (mr *MockdatabaseShardMockRecorder) FetchReadMismatch(ctx, mismatchChecker, id, blockStart, nsCtx interface{}) *gomock.Call { +// FetchWideEntry indicates an expected call of FetchWideEntry +func (mr *MockdatabaseShardMockRecorder) FetchWideEntry(ctx, id, blockStart, nsCtx interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchReadMismatch", reflect.TypeOf((*MockdatabaseShard)(nil).FetchReadMismatch), ctx, mismatchChecker, id, blockStart, nsCtx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchWideEntry", reflect.TypeOf((*MockdatabaseShard)(nil).FetchWideEntry), ctx, id, blockStart, nsCtx) } // FetchBlocks mocks base method diff --git a/src/dbnode/storage/types.go b/src/dbnode/storage/types.go index 185ac9897e..3e28e7f31d 100644 --- a/src/dbnode/storage/types.go +++ b/src/dbnode/storage/types.go @@ -31,7 +31,6 @@ import ( "github.com/m3db/m3/src/dbnode/persist" "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/persist/fs/commitlog" - "github.com/m3db/m3/src/dbnode/persist/fs/wide" "github.com/m3db/m3/src/dbnode/runtime" "github.com/m3db/m3/src/dbnode/sharding" "github.com/m3db/m3/src/dbnode/storage/block" @@ -184,19 +183,7 @@ type Database interface { start time.Time, shards []uint32, iterOpts index.IterationOptions, - ) ([]xio.IndexChecksum, error) // FIXME: change when exact type known. - - // ReadMismatches performs a wide blockwise query that applies a received - // index checksum block batch. - ReadMismatches( - ctx context.Context, - namespace ident.ID, - query index.Query, - mismatchChecker wide.EntryChecksumMismatchChecker, - queryStart time.Time, - shards []uint32, - iterOpts index.IterationOptions, - ) ([]wide.ReadMismatch, error) // TODO: update this type when reader hooked up + ) ([]xio.WideEntry, error) // FIXME: change when exact type known. // FetchBlocks retrieves data blocks for a given id and a list of block // start times. @@ -381,22 +368,13 @@ type databaseNamespace interface { start, end time.Time, ) ([][]xio.BlockReader, error) - // FetchIndexChecksum retrieves the index checksum for an ID for the + // FetchWideEntry retrieves the wide entry for an ID for the // block at time start. - FetchIndexChecksum( - ctx context.Context, - id ident.ID, - blockStart time.Time, - ) (block.StreamedChecksum, error) - - // FetchReadMismatch retrieves the read mismatches for an ID for the - // block at time start, with the given batchReader. - FetchReadMismatch( + FetchWideEntry( ctx context.Context, - mismatchChecker wide.EntryChecksumMismatchChecker, id ident.ID, blockStart time.Time, - ) (wide.StreamedMismatch, error) + ) (block.StreamedWideEntry, error) // FetchBlocks retrieves data blocks for a given id and a list of block // start times. 
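The interface consolidation above collapses two fetch paths (index checksums and read mismatches) into a single FetchWideEntry returning a streamed result. The calling pattern is fetch-then-retrieve: setup errors surface from the fetch call, block retrieval errors surface later from RetrieveWideEntry. A minimal sketch of that two-phase shape, using toy stand-ins for xio.WideEntry and block.StreamedWideEntry rather than the real types:

package main

import (
	"errors"
	"fmt"
)

// wideEntry stands in for xio.WideEntry: the one consolidated result
// type that replaces the separate checksum and mismatch results.
type wideEntry struct {
	id               string
	metadataChecksum int64
}

// streamedWideEntry stands in for block.StreamedWideEntry: fetching
// returns immediately and the entry is only materialized when
// retrieveWideEntry is called.
type streamedWideEntry interface {
	retrieveWideEntry() (wideEntry, error)
}

type streamedResult struct {
	entry wideEntry
	err   error
}

func (s streamedResult) retrieveWideEntry() (wideEntry, error) {
	if s.err != nil {
		return wideEntry{}, s.err
	}
	return s.entry, nil
}

// fetchWideEntry mimics the fetch-then-retrieve split: setup errors
// surface here, retrieval errors surface from retrieveWideEntry.
func fetchWideEntry(id string) (streamedWideEntry, error) {
	if id == "" {
		return nil, errors.New("id required")
	}
	return streamedResult{entry: wideEntry{id: id, metadataChecksum: 5}}, nil
}

func main() {
	streamed, err := fetchWideEntry("foo")
	if err != nil {
		panic(err)
	}
	entry, err := streamed.retrieveWideEntry()
	if err != nil {
		panic(err)
	}
	fmt.Println(entry.id, entry.metadataChecksum)
}
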
@@ -554,23 +532,14 @@ type databaseShard interface {
 		nsCtx namespace.Context,
 	) ([][]xio.BlockReader, error)
 
-	// FetchIndexChecksum retrieves the index checksum for an ID.
-	FetchIndexChecksum(
-		ctx context.Context,
-		id ident.ID,
-		blockStart time.Time,
-		nsCtx namespace.Context,
-	) (block.StreamedChecksum, error)
-
-	// FetchReadMismatch retrieves the read mismatches for an ID for the
-	// block at time start, with the given batchReader.
-	FetchReadMismatch(
+	// FetchWideEntry retrieves the wide entry for an ID for the
+	// block at time start.
+	FetchWideEntry(
 		ctx context.Context,
-		mismatchChecker wide.EntryChecksumMismatchChecker,
 		id ident.ID,
 		blockStart time.Time,
 		nsCtx namespace.Context,
-	) (wide.StreamedMismatch, error)
+	) (block.StreamedWideEntry, error)
 
 	// FetchBlocks retrieves data blocks for a given id and a list of block
 	// start times.
diff --git a/src/dbnode/tracepoint/tracepoint.go b/src/dbnode/tracepoint/tracepoint.go
index 8f8cd66b16..69eefcfe97 100644
--- a/src/dbnode/tracepoint/tracepoint.go
+++ b/src/dbnode/tracepoint/tracepoint.go
@@ -73,8 +73,8 @@ const (
 	// DBWriteBatch is the operation name for the db WriteBatch path.
 	DBWriteBatch = "storage.db.WriteBatch"
 
-	// DBIndexChecksum is the operation name for the tchannelthrift IndexChecksum path.
-	DBIndexChecksum = "storage.db.IndexChecksum"
+	// DBWideEntry is the operation name for the tchannelthrift WideEntry path.
+	DBWideEntry = "storage.db.WideEntry"
 
 	// DBFetchMismatch is the operation name for the tchannelthrift DBFetchMismatch path.
 	DBFetchMismatch = "storage.db.FetchMismatch"
diff --git a/src/dbnode/x/xio/index_checksum.go b/src/dbnode/x/xio/wide_entry.go
similarity index 72%
rename from src/dbnode/x/xio/index_checksum.go
rename to src/dbnode/x/xio/wide_entry.go
index eeedfb31e8..e4724f4026 100644
--- a/src/dbnode/x/xio/index_checksum.go
+++ b/src/dbnode/x/xio/wide_entry.go
@@ -25,24 +25,47 @@ import (
 	"github.com/m3db/m3/src/x/ident"
 )
 
-// IndexChecksum is an entry from the index file which can be passed to
+// WideEntry is an entry from the index file which can be passed to
 // SeekUsingIndexEntry to seek to the data for that entry.
-type IndexChecksum struct {
+type WideEntry struct {
+	finalized bool
+
+	Shard            uint32
 	ID               ident.ID
 	Size             int64
 	Offset           int64
 	DataChecksum     int64
 	EncodedTags      checked.Bytes
 	MetadataChecksum int64
+	Data             checked.Bytes
+}
+
+// Empty returns whether the wide entry is empty, i.e. was not found.
+func (c *WideEntry) Empty() bool {
+	return *c == WideEntry{}
 }
 
-// Finalize finalizes the index checksum.
+// Finalize finalizes the wide entry.
+func (c *WideEntry) Finalize() { + if c.Empty() || c.finalized { + return + } + + c.finalized = true if c.EncodedTags != nil { c.EncodedTags.DecRef() + c.EncodedTags.Finalize() + c.EncodedTags = nil } - if c.ID != nil && c.ID.Bytes() != nil { + if c.ID != nil { c.ID.Finalize() + c.ID = nil + } + + if c.Data != nil { + c.Data.DecRef() + c.Data.Finalize() + c.Data = nil } } diff --git a/src/x/checked/debug.go b/src/x/checked/debug.go index a5f49e63d2..d19ae2476c 100644 --- a/src/x/checked/debug.go +++ b/src/x/checked/debug.go @@ -23,6 +23,7 @@ package checked import ( "bytes" "fmt" + "os" "runtime" "sync" "time" @@ -314,4 +315,8 @@ func tracebackEvent(c *RefCount, ref int, e debuggerEvent) { func init() { leaks.m = make(map[string]uint64) + + if os.Getenv("DEBUG_ENABLE_TRACEBACKS") == "true" { + EnableTracebacks() + } } From a3bd18a10ba0d54681b78d1945af402072afba04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linas=20Med=C5=BEi=C5=ABnas?= Date: Wed, 11 Nov 2020 17:30:12 +0200 Subject: [PATCH 030/106] [dbnode] Introduce Aggregator type (#2840) --- src/dbnode/server/options.go | 1 + src/dbnode/server/server.go | 5 +++ src/dbnode/storage/options.go | 25 +++++++++++ src/dbnode/storage/storage_mock.go | 66 ++++++++++++++++++++++++++++++ src/dbnode/storage/types.go | 21 ++++++++++ 5 files changed, 118 insertions(+) diff --git a/src/dbnode/server/options.go b/src/dbnode/server/options.go index fa2cc58c2a..ba082f20bc 100644 --- a/src/dbnode/server/options.go +++ b/src/dbnode/server/options.go @@ -32,4 +32,5 @@ type StorageOptions struct { TChanNodeServerFn node.NewTChanNodeServerFn BackgroundProcessFns []storage.NewBackgroundProcessFn NamespaceHooks storage.NamespaceHooks + NewTileAggregatorFn storage.NewTileAggregatorFn } diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index c226a444e5..f2bbdb8819 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -875,6 +875,11 @@ func Run(runOpts RunOptions) { opts = opts.SetNamespaceHooks(runOpts.StorageOptions.NamespaceHooks) } + if runOpts.StorageOptions.NewTileAggregatorFn != nil { + aggregator := runOpts.StorageOptions.NewTileAggregatorFn(iopts) + opts = opts.SetTileAggregator(aggregator) + } + // Set bootstrap options - We need to create a topology map provider from the // same topology that will be passed to the cluster so that when we make // bootstrapping decisions they are in sync with the clustered database diff --git a/src/dbnode/storage/options.go b/src/dbnode/storage/options.go index f5c79e9890..2eda82f7b4 100644 --- a/src/dbnode/storage/options.go +++ b/src/dbnode/storage/options.go @@ -177,6 +177,7 @@ type options struct { wideBatchSize int newBackgroundProcessFns []NewBackgroundProcessFn namespaceHooks NamespaceHooks + tileAggregator TileAggregator } // NewOptions creates a new set of storage options with defaults @@ -252,6 +253,7 @@ func newOptions(poolOpts pool.ObjectPoolOptions) Options { mediatorTickInterval: defaultMediatorTickInterval, wideBatchSize: defaultWideBatchSize, namespaceHooks: &noopNamespaceHooks{}, + tileAggregator: &noopTileAggregator{}, } return o.SetEncodingM3TSZPooled() } @@ -891,6 +893,17 @@ func (o *options) NamespaceHooks() NamespaceHooks { return o.namespaceHooks } +func (o *options) SetTileAggregator(value TileAggregator) Options { + opts := *o + opts.tileAggregator = value + + return &opts +} + +func (o *options) TileAggregator() TileAggregator { + return o.tileAggregator +} + type noOpColdFlush struct{} func (n *noOpColdFlush) ColdFlushNamespace(Namespace) 
(OnColdFlushNamespace, error) { @@ -902,3 +915,15 @@ type noopNamespaceHooks struct{} func (h *noopNamespaceHooks) OnCreatedNamespace(Namespace, GetNamespaceFn) error { return nil } + +type noopTileAggregator struct{} + +func (a *noopTileAggregator) AggregateTiles( + opts AggregateTilesOptions, + ns Namespace, + shardID uint32, + readers []fs.DataFileSetReader, + writer fs.StreamingWriter, +) (int64, error) { + return 0, nil +} diff --git a/src/dbnode/storage/storage_mock.go b/src/dbnode/storage/storage_mock.go index 87a3c33efa..386e0beb13 100644 --- a/src/dbnode/storage/storage_mock.go +++ b/src/dbnode/storage/storage_mock.go @@ -4959,6 +4959,34 @@ func (mr *MockOptionsMockRecorder) NamespaceHooks() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NamespaceHooks", reflect.TypeOf((*MockOptions)(nil).NamespaceHooks)) } +// SetTileAggregator mocks base method +func (m *MockOptions) SetTileAggregator(aggregator TileAggregator) Options { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetTileAggregator", aggregator) + ret0, _ := ret[0].(Options) + return ret0 +} + +// SetTileAggregator indicates an expected call of SetTileAggregator +func (mr *MockOptionsMockRecorder) SetTileAggregator(aggregator interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetTileAggregator", reflect.TypeOf((*MockOptions)(nil).SetTileAggregator), aggregator) +} + +// TileAggregator mocks base method +func (m *MockOptions) TileAggregator() TileAggregator { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "TileAggregator") + ret0, _ := ret[0].(TileAggregator) + return ret0 +} + +// TileAggregator indicates an expected call of TileAggregator +func (mr *MockOptionsMockRecorder) TileAggregator() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TileAggregator", reflect.TypeOf((*MockOptions)(nil).TileAggregator)) +} + // MockMemoryTracker is a mock of MemoryTracker interface type MockMemoryTracker struct { ctrl *gomock.Controller @@ -5046,6 +5074,44 @@ func (mr *MockMemoryTrackerMockRecorder) WaitForDec() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForDec", reflect.TypeOf((*MockMemoryTracker)(nil).WaitForDec)) } +// MockTileAggregator is a mock of TileAggregator interface +type MockTileAggregator struct { + ctrl *gomock.Controller + recorder *MockTileAggregatorMockRecorder +} + +// MockTileAggregatorMockRecorder is the mock recorder for MockTileAggregator +type MockTileAggregatorMockRecorder struct { + mock *MockTileAggregator +} + +// NewMockTileAggregator creates a new mock instance +func NewMockTileAggregator(ctrl *gomock.Controller) *MockTileAggregator { + mock := &MockTileAggregator{ctrl: ctrl} + mock.recorder = &MockTileAggregatorMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockTileAggregator) EXPECT() *MockTileAggregatorMockRecorder { + return m.recorder +} + +// AggregateTiles mocks base method +func (m *MockTileAggregator) AggregateTiles(opts AggregateTilesOptions, ns Namespace, shardID uint32, readers []fs.DataFileSetReader, writer fs.StreamingWriter) (int64, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AggregateTiles", opts, ns, shardID, readers, writer) + ret0, _ := ret[0].(int64) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// AggregateTiles indicates an expected call of AggregateTiles +func (mr *MockTileAggregatorMockRecorder) AggregateTiles(opts, ns, shardID, 
readers, writer interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AggregateTiles", reflect.TypeOf((*MockTileAggregator)(nil).AggregateTiles), opts, ns, shardID, readers, writer) +} + // MockNamespaceHooks is a mock of NamespaceHooks interface type MockNamespaceHooks struct { ctrl *gomock.Controller diff --git a/src/dbnode/storage/types.go b/src/dbnode/storage/types.go index 3e28e7f31d..b9a521e785 100644 --- a/src/dbnode/storage/types.go +++ b/src/dbnode/storage/types.go @@ -1298,6 +1298,12 @@ type Options interface { // NamespaceHooks returns the NamespaceHooks. NamespaceHooks() NamespaceHooks + + // SetTileAggregator sets the TileAggregator. + SetTileAggregator(aggregator TileAggregator) Options + + // TileAggregator returns the TileAggregator. + TileAggregator() TileAggregator } // MemoryTracker tracks memory. @@ -1369,6 +1375,21 @@ type AggregateTilesOptions struct { InsOptions instrument.Options } +// TileAggregator is the interface for AggregateTiles. +type TileAggregator interface { + // AggregateTiles does tile aggregation. + AggregateTiles( + opts AggregateTilesOptions, + ns Namespace, + shardID uint32, + readers []fs.DataFileSetReader, + writer fs.StreamingWriter, + ) (int64, error) +} + +// NewTileAggregatorFn creates a new TileAggregator. +type NewTileAggregatorFn func(iOpts instrument.Options) TileAggregator + // NamespaceHooks allows dynamic plugging into the namespace lifecycle. type NamespaceHooks interface { // OnCreatedNamespace gets invoked after each namespace is created. From 3b5c0ff555710ba454834a5a53497773782810d7 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 11 Nov 2020 22:46:51 -0500 Subject: [PATCH 031/106] [coordinator] Set default namespace tag to avoid colliding with commonly used "namespace" label (#2878) * [coordinator] Set default namespace tag to avoid colliding with common "namespace" default value * Use defined constant * Add downsampler test case to demonstrate override namespace tag Co-authored-by: Wesley Kim --- .../downsample/downsampler_test.go | 83 +++++++++++++++++++ .../m3coordinator/downsample/options.go | 13 ++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/cmd/services/m3coordinator/downsample/downsampler_test.go b/src/cmd/services/m3coordinator/downsample/downsampler_test.go index bc850de531..4b9f4aac64 100644 --- a/src/cmd/services/m3coordinator/downsample/downsampler_test.go +++ b/src/cmd/services/m3coordinator/downsample/downsampler_test.go @@ -1274,6 +1274,87 @@ func TestDownsamplerAggregationWithRemoteAggregatorClient(t *testing.T) { testDownsamplerRemoteAggregation(t, testDownsampler) } +func TestDownsamplerWithOverrideNamespace(t *testing.T) { + overrideNamespaceTag := "override_namespace_tag" + + gaugeMetric := testGaugeMetric{ + tags: map[string]string{ + nameTag: "http_requests", + "app": "nginx_edge", + "status_code": "500", + "endpoint": "/foo/bar", + "not_rolled_up": "not_rolled_up_value", + // Set namespace tags on ingested metrics. + // The test demonstrates that overrideNamespaceTag is respected, meaning setting + // values on defaultNamespaceTag won't affect aggregation. 
+ defaultNamespaceTag: "namespace_ignored", + }, + timedSamples: []testGaugeMetricTimedSample{ + {value: 42}, + {value: 64, offset: 5 * time.Second}, + }, + } + res := 5 * time.Second + ret := 30 * 24 * time.Hour + testDownsampler := newTestDownsampler(t, testDownsamplerOptions{ + rulesConfig: &RulesConfiguration{ + RollupRules: []RollupRuleConfiguration{ + { + Filter: fmt.Sprintf( + "%s:http_requests app:* status_code:* endpoint:*", + nameTag), + Transforms: []TransformConfiguration{ + { + Transform: &TransformOperationConfiguration{ + Type: transformation.PerSecond, + }, + }, + { + Rollup: &RollupOperationConfiguration{ + MetricName: "http_requests_by_status_code", + GroupBy: []string{"app", "status_code", "endpoint"}, + Aggregations: []aggregation.Type{aggregation.Sum}, + }, + }, + }, + StoragePolicies: []StoragePolicyConfiguration{ + { + Resolution: res, + Retention: ret, + }, + }, + }, + }, + }, + matcherConfig: MatcherConfiguration{NamespaceTag: overrideNamespaceTag}, + ingest: &testDownsamplerOptionsIngest{ + gaugeMetrics: []testGaugeMetric{gaugeMetric}, + }, + expect: &testDownsamplerOptionsExpect{ + writes: []testExpectedWrite{ + { + tags: map[string]string{ + nameTag: "http_requests_by_status_code", + string(rollupTagName): string(rollupTagValue), + "app": "nginx_edge", + "status_code": "500", + "endpoint": "/foo/bar", + }, + values: []expectedValue{{value: 4.4}}, + attributes: &storagemetadata.Attributes{ + MetricsType: storagemetadata.AggregatedMetricsType, + Resolution: res, + Retention: ret, + }, + }, + }, + }, + }) + + // Test expected output + testDownsamplerAggregation(t, testDownsampler) +} + func originalStagedMetadata(t *testing.T, testDownsampler testDownsampler) []metricpb.StagedMetadatas { ds, ok := testDownsampler.downsampler.(*downsampler) require.True(t, ok) @@ -1751,6 +1832,7 @@ type testDownsamplerOptions struct { sampleAppenderOpts *SampleAppenderOptions remoteClientMock *client.MockClient rulesConfig *RulesConfiguration + matcherConfig MatcherConfiguration // Test ingest and expectations overrides ingest *testDownsamplerOptionsIngest @@ -1821,6 +1903,7 @@ func newTestDownsampler(t *testing.T, opts testDownsamplerOptions) testDownsampl if opts.rulesConfig != nil { cfg.Rules = opts.rulesConfig } + cfg.Matcher = opts.matcherConfig instance, err := cfg.NewDownsampler(DownsamplerOptions{ Storage: storage, diff --git a/src/cmd/services/m3coordinator/downsample/options.go b/src/cmd/services/m3coordinator/downsample/options.go index b1b61df731..da966cfe00 100644 --- a/src/cmd/services/m3coordinator/downsample/options.go +++ b/src/cmd/services/m3coordinator/downsample/options.go @@ -86,7 +86,8 @@ const ( ) var ( - numShards = runtime.NumCPU() + numShards = runtime.NumCPU() + defaultNamespaceTag = metric.M3MetricsPrefixString + "_namespace__" errNoStorage = errors.New("downsampling enabled with storage not set") errNoClusterClient = errors.New("downsampling enabled with cluster client not set") @@ -267,6 +268,9 @@ type Configuration struct { type MatcherConfiguration struct { // Cache if non-zero will set the capacity of the rules matching cache. Cache MatcherCacheConfiguration `yaml:"cache"` + // NamespaceTag defines the namespace tag to use to select rules + // namespace to evaluate against. Default is "__m3_namespace__". + NamespaceTag string `yaml:"namespaceTag"` } // MatcherCacheConfiguration is the configuration for the rule matcher cache. 
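The test above depends on the configured namespace tag taking precedence over the prefixed default. The resolution the next hunk wires into newAggregator is a plain default-then-override check; a self-contained sketch of just that precedence (the default value is the documented "__m3_namespace__"; the function name is illustrative):

package main

import "fmt"

// Stand-in for metric.M3MetricsPrefixString + "_namespace__".
const defaultNamespaceTag = "__m3_namespace__"

type matcherConfiguration struct {
	// NamespaceTag mirrors the new yaml field; empty means "use default".
	NamespaceTag string
}

// resolveNamespaceTag applies the same precedence as the hunk below:
// an explicitly configured tag wins, otherwise the prefixed default
// avoids colliding with a user-supplied "namespace" label.
func resolveNamespaceTag(cfg matcherConfiguration) string {
	if cfg.NamespaceTag != "" {
		return cfg.NamespaceTag
	}
	return defaultNamespaceTag
}

func main() {
	fmt.Println(resolveNamespaceTag(matcherConfiguration{}))
	fmt.Println(resolveNamespaceTag(matcherConfiguration{NamespaceTag: "override_namespace_tag"}))
}
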
@@ -647,6 +651,7 @@ func (cfg Configuration) newAggregator(o DownsamplerOptions) (agg, error) {
 		logger                  = instrumentOpts.Logger()
 		openTimeout             = defaultOpenTimeout
 		m3PrefixFilter          = false
+		namespaceTag            = defaultNamespaceTag
 	)
 	if o.StorageFlushConcurrency > 0 {
 		storageFlushConcurrency = o.StorageFlushConcurrency
@@ -654,6 +659,9 @@ func (cfg Configuration) newAggregator(o DownsamplerOptions) (agg, error) {
 	if o.OpenTimeout > 0 {
 		openTimeout = o.OpenTimeout
 	}
+	if cfg.Matcher.NamespaceTag != "" {
+		namespaceTag = cfg.Matcher.NamespaceTag
+	}
 
 	pools := o.newAggregatorPools()
 	ruleSetOpts := o.newAggregatorRulesOptions(pools)
@@ -662,7 +670,8 @@ func (cfg Configuration) newAggregator(o DownsamplerOptions) (agg, error) {
 		SetClockOptions(clockOpts).
 		SetInstrumentOptions(instrumentOpts).
 		SetRuleSetOptions(ruleSetOpts).
-		SetKVStore(o.RulesKVStore)
+		SetKVStore(o.RulesKVStore).
+		SetNamespaceTag([]byte(namespaceTag))
 
 	// NB(r): If rules are being explicitly set in config then we are
 	// going to use an in memory KV store for rules and explicitly set them up.

From 567dd45d962dea642c076391103cd8ae4ee156e0 Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Sat, 14 Nov 2020 00:21:53 -0500
Subject: [PATCH 032/106] Add read through caching of searches to segments
 (including foreground)

---
 src/cmd/services/m3dbnode/config/cache.go     |  12 ++
 src/dbnode/integration/setup.go               |   1 +
 .../persist/fs/migration/migration_test.go    |   3 +
 src/dbnode/server/server.go                   |  54 ++++----
 src/dbnode/storage/index/block.go             |   4 +-
 .../storage/index/compaction/compactor.go     |   6 +-
 src/dbnode/storage/index/mutable_segments.go  |  25 ++--
 .../storage/index/postings_list_cache.go      | 115 +++++++++++++++---
 .../storage/index/postings_list_cache_lru.go  |  25 +++-
 .../storage/index/read_through_segment.go     |  93 ++++++++++++--
 .../postings/roaring/bitmap_multi_readonly.go |   7 ++
 src/m3ninx/search/executor/executor.go        |  12 +-
 src/m3ninx/search/executor/iterator.go        |  22 +++-
 src/m3ninx/search/query/all.go                |   6 +-
 src/m3ninx/search/query/conjunction.go        |  27 +++-
 src/m3ninx/search/query/disjunction.go        |  26 +++-
 src/m3ninx/search/query/field.go              |  26 +++-
 src/m3ninx/search/query/negation.go           |  26 +++-
 src/m3ninx/search/query/regexp.go             |  26 +++-
 src/m3ninx/search/query/term.go               |  30 ++++-
 src/m3ninx/search/query/util.go               |  24 ++--
 src/m3ninx/search/query/util_test.go          |   8 +-
 src/m3ninx/search/types.go                    |   6 +
 23 files changed, 473 insertions(+), 111 deletions(-)

diff --git a/src/cmd/services/m3dbnode/config/cache.go b/src/cmd/services/m3dbnode/config/cache.go
index 65f72dcf14..c9a4e711b6 100644
--- a/src/cmd/services/m3dbnode/config/cache.go
+++ b/src/cmd/services/m3dbnode/config/cache.go
@@ -26,6 +26,7 @@ var (
 	defaultPostingsListCacheSize   = 2 << 11 // 4096
 	defaultPostingsListCacheRegexp = true
 	defaultPostingsListCacheTerms  = true
+	defaultPostingsListCacheSearch = true
 	defaultRegexpCacheSize         = 256
 )
 
@@ -87,6 +88,7 @@ type PostingsListCacheConfiguration struct {
 	Size        *int  `yaml:"size"`
 	CacheRegexp *bool `yaml:"cacheRegexp"`
 	CacheTerms  *bool `yaml:"cacheTerms"`
+	CacheSearch *bool `yaml:"cacheSearch"`
 }
 
 // SizeOrDefault returns the provided size or the default value is none is
@@ -119,6 +121,16 @@ func (p PostingsListCacheConfiguration) CacheTermsOrDefault() bool {
 	return *p.CacheTerms
 }
 
+// CacheSearchOrDefault returns the provided cache search configuration value
+// or the default value if none is provided.
+func (p PostingsListCacheConfiguration) CacheSearchOrDefault() bool { + if p.CacheSearch == nil { + return defaultPostingsListCacheSearch + } + + return *p.CacheSearch +} + // RegexpCacheConfiguration is a compiled regexp cache for query regexps. type RegexpCacheConfiguration struct { Size *int `yaml:"size"` diff --git a/src/dbnode/integration/setup.go b/src/dbnode/integration/setup.go index b14123c4d3..1094711653 100644 --- a/src/dbnode/integration/setup.go +++ b/src/dbnode/integration/setup.go @@ -275,6 +275,7 @@ func NewTestSetup( } plCache, stopReporting, err := index.NewPostingsListCache(10, index.PostingsListCacheOptions{ + PostingsListPool: storageOpts.IndexOptions().SegmentBuilderOptions().PostingsListPool(), InstrumentOptions: iOpts, }) if err != nil { diff --git a/src/dbnode/persist/fs/migration/migration_test.go b/src/dbnode/persist/fs/migration/migration_test.go index 8d81f14428..7abf1a6046 100644 --- a/src/dbnode/persist/fs/migration/migration_test.go +++ b/src/dbnode/persist/fs/migration/migration_test.go @@ -37,6 +37,8 @@ import ( "github.com/m3db/m3/src/dbnode/storage" "github.com/m3db/m3/src/dbnode/storage/block" "github.com/m3db/m3/src/dbnode/storage/index" + "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/x/checked" "github.com/m3db/m3/src/x/ident" "github.com/m3db/m3/src/x/instrument" @@ -73,6 +75,7 @@ func TestToVersion1_1Run(t *testing.T) { require.NoError(t, err) plCache, closer, err := index.NewPostingsListCache(1, index.PostingsListCacheOptions{ + PostingsListPool: postings.NewPool(nil, roaring.NewPostingsList), InstrumentOptions: instrument.NewOptions(), }) defer closer() diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index 39ed4f939b..9f640d2c0e 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -405,27 +405,6 @@ func Run(runOpts RunOptions) { runtimeOpts = runtimeOpts.SetMaxWiredBlocks(lruCfg.MaxBlocks) } - // Setup postings list cache. - var ( - plCacheConfig = cfg.Cache.PostingsListConfiguration() - plCacheSize = plCacheConfig.SizeOrDefault() - plCacheOptions = index.PostingsListCacheOptions{ - InstrumentOptions: opts.InstrumentOptions(). - SetMetricsScope(scope.SubScope("postings-list-cache")), - } - ) - postingsListCache, stopReporting, err := index.NewPostingsListCache(plCacheSize, plCacheOptions) - if err != nil { - logger.Fatal("could not construct postings list cache", zap.Error(err)) - } - defer stopReporting() - - // Setup index regexp compilation cache. - m3ninxindex.SetRegexpCacheOptions(m3ninxindex.RegexpCacheOptions{ - Size: cfg.Cache.RegexpConfiguration().SizeOrDefault(), - Scope: iopts.MetricsScope(), - }) - // Setup query stats tracking. docsLimit := limits.DefaultLookbackLimitOptions() bytesReadLimit := limits.DefaultLookbackLimitOptions() @@ -461,11 +440,12 @@ func Run(runOpts RunOptions) { SetQueryBlockWorkerPool(queryBlockWorkerPool) } + plCacheConfig := cfg.Cache.PostingsListConfiguration() indexOpts = indexOpts.SetInsertMode(insertMode). - SetPostingsListCache(postingsListCache). SetReadThroughSegmentOptions(index.ReadThroughSegmentOptions{ - CacheRegexp: plCacheConfig.CacheRegexpOrDefault(), - CacheTerms: plCacheConfig.CacheTermsOrDefault(), + CacheRegexp: plCacheConfig.CacheRegexpOrDefault(), + CacheTerms: plCacheConfig.CacheTermsOrDefault(), + CacheSearches: plCacheConfig.CacheSearchOrDefault(), }). SetMmapReporter(mmapReporter). 
SetQueryLimits(queryLimits) @@ -570,7 +550,33 @@ func Run(runOpts RunOptions) { logger.Fatal("could not get pooling policy", zap.Error(err)) } + // Create pools. opts = withEncodingAndPoolingOptions(cfg, logger, opts, poolingPolicy) + + // Setup postings list cache. + var ( + plCacheSize = plCacheConfig.SizeOrDefault() + plCacheOptions = index.PostingsListCacheOptions{ + InstrumentOptions: opts.InstrumentOptions(). + SetMetricsScope(scope.SubScope("postings-list-cache")), + } + ) + postingsListCache, stopReporting, err := index.NewPostingsListCache(plCacheSize, plCacheOptions) + if err != nil { + logger.Fatal("could not construct postings list cache", zap.Error(err)) + } + defer stopReporting() + + opts = opts.SetIndexOptions(opts.IndexOptions(). + SetPostingsListCache(postingsListCache)) + + // Setup index regexp compilation cache. + m3ninxindex.SetRegexpCacheOptions(m3ninxindex.RegexpCacheOptions{ + Size: cfg.Cache.RegexpConfiguration().SizeOrDefault(), + Scope: iopts.MetricsScope(), + }) + + // Apply commit log options. opts = opts.SetCommitLogOptions(opts.CommitLogOptions(). SetInstrumentOptions(opts.InstrumentOptions()). SetFilesystemOptions(fsopts). diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index b5a46ee6a4..44175adc37 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -411,8 +411,8 @@ func (b *block) Query( } const ( - queryGroupReadersParallelism = 8 - queryGroupSize = 8 + queryGroupReadersParallelism = 32 + queryGroupSize = 32 ) type queryGroup struct { diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index 288d63ef35..56408ee4df 100644 --- a/src/dbnode/storage/index/compaction/compactor.go +++ b/src/dbnode/storage/index/compaction/compactor.go @@ -106,7 +106,7 @@ func NewCompactor( func (c *Compactor) Compact( segs []segment.Segment, reporterOptions mmap.ReporterOptions, -) (segment.Segment, error) { +) (fst.Segment, error) { c.Lock() defer c.Unlock() @@ -127,7 +127,7 @@ func (c *Compactor) CompactUsingBuilder( builder segment.DocumentsBuilder, segs []segment.Segment, reporterOptions mmap.ReporterOptions, -) (segment.Segment, error) { +) (fst.Segment, error) { // NB(r): Ensure only single compaction happens at a time since the buffers are // reused between runs. 
c.Lock() @@ -231,7 +231,7 @@ func (c *Compactor) CompactUsingBuilder( func (c *Compactor) compactFromBuilderWithLock( builder segment.Builder, reporterOptions mmap.ReporterOptions, -) (segment.Segment, error) { +) (fst.Segment, error) { defer func() { // Release resources regardless of result, // otherwise old compacted segments are held onto diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 256bb4a313..6885266ca1 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -446,6 +446,14 @@ func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { } } +func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment { + var ( + plCache = m.opts.PostingsListCache() + readThroughOpts = m.opts.ReadThroughSegmentOptions() + ) + return NewReadThroughSegment(seg, plCache, readThroughOpts) +} + func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, log bool, @@ -481,20 +489,14 @@ func (m *mutableSegments) backgroundCompactWithTask( // Add a read through cache for repeated expensive queries against // background compacted segments since they can live for quite some // time and accrue a large set of documents. - if immSeg, ok := compacted.(segment.ImmutableSegment); ok { - var ( - plCache = m.opts.PostingsListCache() - readThroughOpts = m.opts.ReadThroughSegmentOptions() - ) - compacted = NewReadThroughSegment(immSeg, plCache, readThroughOpts) - } + segment := m.newReadThroughSegment(compacted) // Rotate out the replaced frozen segments and add the compacted one. m.Lock() defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, compacted) + segments, segment) m.backgroundSegments = result return nil @@ -710,12 +712,17 @@ func (m *mutableSegments) foregroundCompactWithTask( return err } + // Add a read through cache for repeated expensive queries against + // compacted segments since they can live for quite some time during + // block rotations while a burst of segments are created. + segment := m.newReadThroughSegment(compacted) + // Rotate in the ones we just compacted. m.Lock() defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.foregroundSegments, - segments, compacted) + segments, segment) m.foregroundSegments = result return nil diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index 17d2451557..c2b2139792 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -21,14 +21,22 @@ package index import ( + "errors" "sync" "time" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/x/instrument" "github.com/pborman/uuid" "github.com/uber-go/tally" + "go.uber.org/zap" +) + +var ( + errNoPostingsListPool = errors.New("no postings list pool set") + errInstrumentOptions = errors.New("no instrument options set") ) // PatternType is an enum for the various pattern types. It allows us @@ -45,6 +53,8 @@ const ( PatternTypeTerm // PatternTypeField indicates that the pattern is of type field. PatternTypeField + // PatternTypeSearch indicates that the pattern is of type search. + PatternTypeSearch reportLoopInterval = 10 * time.Second emptyPattern = "" @@ -52,9 +62,21 @@ const ( // PostingsListCacheOptions is the options struct for the query cache. 
type PostingsListCacheOptions struct { + PostingsListPool postings.Pool InstrumentOptions instrument.Options } +// Validate will return an error if the options are not valid. +func (o PostingsListCacheOptions) Validate() error { + if o.PostingsListPool == nil { + return errNoPostingsListPool + } + if o.InstrumentOptions == nil { + return errInstrumentOptions + } + return nil +} + // PostingsListCache implements an LRU for caching queries and their results. type PostingsListCache struct { sync.Mutex @@ -64,20 +86,30 @@ type PostingsListCache struct { size int opts PostingsListCacheOptions metrics *postingsListCacheMetrics + logger *zap.Logger } // NewPostingsListCache creates a new query cache. -func NewPostingsListCache(size int, opts PostingsListCacheOptions) (*PostingsListCache, Closer, error) { - lru, err := newPostingsListLRU(size) +func NewPostingsListCache( + size int, + opts PostingsListCacheOptions, +) (*PostingsListCache, Closer, error) { + err := opts.Validate() + if err != nil { + return nil, nil, err + } + + lru, err := newPostingsListLRU(size, nil) if err != nil { return nil, nil, err } plc := &PostingsListCache{ - lru: lru, size: size, + lru: lru, opts: opts, metrics: newPostingsListCacheMetrics(opts.InstrumentOptions.MetricsScope()), + logger: opts.InstrumentOptions.Logger(), } closer := plc.startReportLoop() @@ -110,6 +142,14 @@ func (q *PostingsListCache) GetField( return q.get(segmentUUID, field, emptyPattern, PatternTypeField) } +// GetSearch returns the cached results for the provided search query, if any. +func (q *PostingsListCache) GetSearch( + segmentUUID uuid.UUID, + query string, +) (postings.List, bool) { + return q.get(segmentUUID, query, emptyPattern, PatternTypeSearch) +} + func (q *PostingsListCache) get( segmentUUID uuid.UUID, field string, @@ -137,7 +177,8 @@ func (q *PostingsListCache) PutRegexp( pattern string, pl postings.List, ) { - q.put(segmentUUID, field, pattern, PatternTypeRegexp, pl) + q.put(segmentUUID, field, pattern, PatternTypeRegexp, pl, + postingsListMetadata{}) } // PutTerm updates the LRU with the result of the term query. @@ -147,7 +188,8 @@ func (q *PostingsListCache) PutTerm( pattern string, pl postings.List, ) { - q.put(segmentUUID, field, pattern, PatternTypeTerm, pl) + q.put(segmentUUID, field, pattern, PatternTypeTerm, pl, + postingsListMetadata{}) } // PutField updates the LRU with the result of the field query. @@ -156,7 +198,33 @@ func (q *PostingsListCache) PutField( field string, pl postings.List, ) { - q.put(segmentUUID, field, emptyPattern, PatternTypeField, pl) + q.put(segmentUUID, field, emptyPattern, PatternTypeField, pl, + postingsListMetadata{}) +} + +// PutSearch updates the LRU with the result of a search query. +func (q *PostingsListCache) PutSearch( + segmentUUID uuid.UUID, + query string, + pl postings.List, +) { + pooled := false + if roaring.IsReadOnlyPostingsList(pl) { + // Copy into mutable postings list since it's expensive to read from + // a read only postings list over and over again (it's lazily + // evaluated from for allocation purposes). 
+ pooled = true + mutable := q.opts.PostingsListPool.Get() + if err := mutable.AddIterator(pl.Iterator()); err != nil { + q.metrics.pooledGetErrAddIter.Inc(1) + q.logger.Error("unable to add postings iter", zap.Error(err)) + return + } + pl = mutable + } + + q.put(segmentUUID, query, emptyPattern, PatternTypeSearch, pl, + postingsListMetadata{Pooled: pooled}) } func (q *PostingsListCache) put( @@ -165,15 +233,10 @@ func (q *PostingsListCache) put( pattern string, patternType PatternType, pl postings.List, + meta postingsListMetadata, ) { q.Lock() - q.lru.Add( - segmentUUID, - field, - pattern, - patternType, - pl, - ) + q.lru.Add(segmentUUID, field, pattern, patternType, pl, meta) q.Unlock() q.emitCachePutMetrics(patternType) } @@ -233,6 +296,8 @@ func (q *PostingsListCache) emitCacheGetMetrics(patternType PatternType, hit boo method = q.metrics.term case PatternTypeField: method = q.metrics.field + case PatternTypeSearch: + method = q.metrics.search default: method = q.metrics.unknown // should never happen } @@ -251,6 +316,8 @@ func (q *PostingsListCache) emitCachePutMetrics(patternType PatternType) { q.metrics.term.puts.Inc(1) case PatternTypeField: q.metrics.field.puts.Inc(1) + case PatternTypeSearch: + q.metrics.search.puts.Inc(1) default: q.metrics.unknown.puts.Inc(1) // should never happen } @@ -260,10 +327,16 @@ type postingsListCacheMetrics struct { regexp *postingsListCacheMethodMetrics term *postingsListCacheMethodMetrics field *postingsListCacheMethodMetrics + search *postingsListCacheMethodMetrics unknown *postingsListCacheMethodMetrics size tally.Gauge capacity tally.Gauge + + pooledGet tally.Counter + pooledGetErrAddIter tally.Counter + pooledPut tally.Counter + pooledPutErrNotMutable tally.Counter } func newPostingsListCacheMetrics(scope tally.Scope) *postingsListCacheMetrics { @@ -277,12 +350,22 @@ func newPostingsListCacheMetrics(scope tally.Scope) *postingsListCacheMetrics { field: newPostingsListCacheMethodMetrics(scope.Tagged(map[string]string{ "query_type": "field", })), + search: newPostingsListCacheMethodMetrics(scope.Tagged(map[string]string{ + "query_type": "search", + })), unknown: newPostingsListCacheMethodMetrics(scope.Tagged(map[string]string{ "query_type": "unknown", })), - - size: scope.Gauge("size"), - capacity: scope.Gauge("capacity"), + size: scope.Gauge("size"), + capacity: scope.Gauge("capacity"), + pooledGet: scope.Counter("pooled_get"), + pooledGetErrAddIter: scope.Tagged(map[string]string{ + "error_type": "add_iter", + }).Counter("pooled_get_error"), + pooledPut: scope.Counter("pooled_put"), + pooledPutErrNotMutable: scope.Tagged(map[string]string{ + "error_type": "not_mutable", + }).Counter("pooled_put_error"), } } diff --git a/src/dbnode/storage/index/postings_list_cache_lru.go b/src/dbnode/storage/index/postings_list_cache_lru.go index b7c9d9be4f..ed27ac42b0 100644 --- a/src/dbnode/storage/index/postings_list_cache_lru.go +++ b/src/dbnode/storage/index/postings_list_cache_lru.go @@ -62,6 +62,14 @@ type postingsListLRU struct { size int evictList *list.List items map[uuid.Array]map[key]*list.Element + onRemove onRemoveFn +} + +type onRemoveFn func(pl postings.List, metadata postingsListMetadata) + +// postingsListMetadata is metadata about the postings list. +type postingsListMetadata struct { + Pooled bool } // entry is used to hold a value in the evictList. 
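The onRemoveFn and postingsListMetadata types above exist so the LRU can notify its owner on eviction (the hunks below thread the hook through newPostingsListLRU and fire it from removeElement), letting pooled postings lists be returned to the pool exactly once. A condensed sketch of an LRU with an eviction callback in that shape (string keys and int values stand in for the segment UUID/pattern keys and postings lists):

package main

import (
	"container/list"
	"fmt"
)

type lruEntry struct {
	key   string
	value int
}

type lru struct {
	size      int
	evictList *list.List
	items     map[string]*list.Element
	onRemove  func(key string, value int)
}

func newLRU(size int, onRemove func(string, int)) *lru {
	return &lru{
		size:      size,
		evictList: list.New(),
		items:     make(map[string]*list.Element),
		onRemove:  onRemove,
	}
}

func (c *lru) add(key string, value int) {
	if elem, ok := c.items[key]; ok {
		// Existing entry: refresh recency and update in place.
		c.evictList.MoveToFront(elem)
		elem.Value.(*lruEntry).value = value
		return
	}
	c.items[key] = c.evictList.PushFront(&lruEntry{key: key, value: value})
	if c.evictList.Len() > c.size {
		c.removeElement(c.evictList.Back())
	}
}

func (c *lru) removeElement(elem *list.Element) {
	c.evictList.Remove(elem)
	entry := elem.Value.(*lruEntry)
	delete(c.items, entry.key)
	// The hook fires on every removal so the owner can release pooled
	// values (what the metadata.Pooled flag tracks in the diff).
	if c.onRemove != nil {
		c.onRemove(entry.key, entry.value)
	}
}

func main() {
	c := newLRU(2, func(k string, v int) { fmt.Println("evicted", k, v) })
	c.add("a", 1)
	c.add("b", 2)
	c.add("c", 3) // evicts "a" and fires the callback
}
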
@@ -69,6 +77,7 @@ type entry struct { uuid uuid.UUID key key postingsList postings.List + metadata postingsListMetadata } type key struct { @@ -78,7 +87,10 @@ type key struct { } // newPostingsListLRU constructs an LRU of the given size. -func newPostingsListLRU(size int) (*postingsListLRU, error) { +func newPostingsListLRU( + size int, + onRemove onRemoveFn, +) (*postingsListLRU, error) { if size <= 0 { return nil, errors.New("Must provide a positive size") } @@ -87,6 +99,7 @@ func newPostingsListLRU(size int) (*postingsListLRU, error) { size: size, evictList: list.New(), items: make(map[uuid.Array]map[key]*list.Element), + onRemove: onRemove, }, nil } @@ -97,6 +110,7 @@ func (c *postingsListLRU) Add( pattern string, patternType PatternType, pl postings.List, + metadata postingsListMetadata, ) (evicted bool) { newKey := newKey(field, pattern, patternType) // Check for existing item. @@ -108,7 +122,9 @@ func (c *postingsListLRU) Add( // can only point to one entry at a time and we use them for purges. Also, // it saves space by avoiding storing duplicate values. c.evictList.MoveToFront(ent) - ent.Value.(*entry).postingsList = pl + e := ent.Value.(*entry) + e.postingsList = pl + e.metadata = metadata return false } } @@ -119,6 +135,7 @@ func (c *postingsListLRU) Add( uuid: segmentUUID, key: newKey, postingsList: pl, + metadata: metadata, } entry = c.evictList.PushFront(ent) ) @@ -209,6 +226,10 @@ func (c *postingsListLRU) removeElement(e *list.Element) { delete(c.items, entry.uuid.Array()) } } + + if c.onRemove != nil { + c.onRemove(entry.postingsList, entry.metadata) + } } func newKey(field, pattern string, patternType PatternType) key { diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index b234ba275b..60c65e62bd 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -28,10 +28,13 @@ import ( "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/search" "github.com/pborman/uuid" ) +const maxUniqueQueryCount = 2 << 14 // 32k + var ( errCantGetReaderFromClosedSegment = errors.New("cant get reader from closed segment") errCantCloseClosedSegment = errors.New("cant close closed segment") @@ -58,16 +61,28 @@ type ReadThroughSegment struct { opts ReadThroughSegmentOptions + searches readThroughSegmentSearches + closed bool } +type readThroughSegmentSearches struct { + sync.RWMutex + queries map[string]int +} + // ReadThroughSegmentOptions is the options struct for the // ReadThroughSegment. type ReadThroughSegmentOptions struct { - // Whether the postings list for regexp queries should be cached. + // CacheRegexp sets whether the postings list for regexp queries + // should be cached. CacheRegexp bool - // Whether the postings list for term queries should be cached. + // CacheTerms sets whether the postings list for term queries + // should be cached. CacheTerms bool + // CacheSearches sets whether the postings list for search queries + // should be cached. + CacheSearches bool } // NewReadThroughSegment creates a new read through segment. 
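Before the constructor hunks below, it is worth spelling out the admission policy the new Search method implements: a search result is only cached the second time its query string is seen, and the tracking map is bounded by deleting an arbitrary entry once it reaches maxUniqueQueryCount (Go's unspecified map iteration order makes this an effectively random eviction). A standalone sketch of just that policy (names are illustrative, and the capacity is shrunk for demonstration):

package main

import (
	"fmt"
	"sync"
)

// Stands in for maxUniqueQueryCount in the diff.
const maxSeenQueries = 4

type seenQueries struct {
	mu      sync.Mutex
	queries map[string]int
}

// shouldCache reports whether a query's result should be admitted to
// the cache: false on first sighting, true on the second.
func (s *seenQueries) shouldCache(query string) bool {
	s.mu.Lock()
	defer s.mu.Unlock()

	count := s.queries[query] + 1
	if count > 1 {
		// Second sighting: admit to the cache and stop tracking.
		delete(s.queries, query)
		return true
	}

	if len(s.queries) >= maxSeenQueries {
		// At capacity: drop an arbitrary tracked query to make room.
		for k := range s.queries {
			delete(s.queries, k)
			break
		}
	}
	s.queries[query] = count
	return false
}

func main() {
	s := &seenQueries{queries: make(map[string]int)}
	fmt.Println(s.shouldCache("term(city,nyc)")) // false: first sighting
	fmt.Println(s.shouldCache("term(city,nyc)")) // true: second sighting
}
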
@@ -81,6 +96,9 @@ func NewReadThroughSegment(
 		opts:              opts,
 		uuid:              uuid.NewUUID(),
 		postingsListCache: cache,
+		searches: readThroughSegmentSearches{
+			queries: make(map[string]int),
+		},
 	}
 }
 
@@ -96,8 +114,8 @@ func (r *ReadThroughSegment) Reader() (segment.Reader, error) {
 	if err != nil {
 		return nil, err
 	}
-	return newReadThroughSegmentReader(
-		reader, r.uuid, r.postingsListCache, r.opts), nil
+	return newReadThroughSegmentReader(r, reader, r.uuid,
+		r.postingsListCache, r.opts), nil
 }
 
 // Close purges all entries in the cache associated with this segment,
@@ -155,7 +173,10 @@ func (r *ReadThroughSegment) Size() int64 {
 	return r.segment.Size()
 }
 
+var _ search.ReadThroughSegmentSearcher = (*readThroughSegmentReader)(nil)
+
 type readThroughSegmentReader struct {
+	seg *ReadThroughSegment
 	// reader is explicitly not embedded at the top level
 	// of the struct to force new methods added to index.Reader
 	// to be explicitly supported by the read through cache.
@@ -166,16 +187,18 @@
 func newReadThroughSegmentReader(
+	seg *ReadThroughSegment,
 	reader segment.Reader,
 	uuid uuid.UUID,
 	cache *PostingsListCache,
 	opts ReadThroughSegmentOptions,
 ) segment.Reader {
 	return &readThroughSegmentReader{
-		reader:            reader,
-		opts:              opts,
-		uuid:              uuid,
-		postingsListCache: cache,
+		seg:               seg,
+		reader:            reader,
+		opts:              opts,
+		uuid:              uuid,
+		postingsListCache: cache,
 	}
 }
@@ -301,3 +323,58 @@ func (s *readThroughSegmentReader) Terms(field []byte) (segment.TermsIterator, e
 func (s *readThroughSegmentReader) Close() error {
 	return s.reader.Close()
 }
+
+func (s *readThroughSegmentReader) Search(
+	query search.Query,
+	searcher search.Searcher,
+) (postings.List, error) {
+	if s.postingsListCache == nil || !s.opts.CacheSearches {
+		return searcher.Search(s)
+	}
+
+	// TODO(r): Would be nice to not allocate strings here.
+	queryStr := query.String()
+	pl, ok := s.postingsListCache.GetSearch(s.uuid, queryStr)
+	if ok {
+		return pl, nil
+	}
+
+	pl, err := searcher.Search(s)
+	if err != nil {
+		return nil, err
+	}
+
+	s.seg.searches.Lock()
+	count := 1
+	curr, ok := s.seg.searches.queries[queryStr]
+	if !ok {
+		if len(s.seg.searches.queries) >= maxUniqueQueryCount {
+			// Delete a random key to make room.
+			for k := range s.seg.searches.queries {
+				delete(s.seg.searches.queries, k)
+				break // Immediately break.
+			}
+			s.seg.searches.queries[queryStr] = count
+		}
+	} else {
+		count = curr + 1
+	}
+	willCache := count > 1
+	if willCache {
+		// Delete out of the seen query count.
+		delete(s.seg.searches.queries, queryStr)
+	} else {
+		// Update seen count.
+		s.seg.searches.queries[queryStr] = count
+	}
+	s.seg.searches.Unlock()
+
+	if willCache {
+		// Only cache a query the second time it is seen recently, since
+		// copying the postings list into a mutable roaring postings list
+		// is expensive and only pays off for repeated queries.
+		s.postingsListCache.PutSearch(s.uuid, queryStr, pl)
+	}
+
+	return pl, nil
+}
diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
index 2f60fedae3..4355801b6a 100644
--- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
+++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go
@@ -78,6 +78,13 @@ func IntersectAndNegateReadOnly(
 	})
 }
 
+// IsReadOnlyPostingsList returns whether a postings list is read only
+// or not.
+func IsReadOnlyPostingsList(pl postings.List) bool {
+	_, ok := pl.(readOnlyIterable)
+	return ok
+}
+
 // ReadOnlyBitmapIntersectCheck is a check that can be repeated
 // against read only bitmaps without allocations.
type ReadOnlyBitmapIntersectCheck struct { diff --git a/src/m3ninx/search/executor/executor.go b/src/m3ninx/search/executor/executor.go index e5f606a82f..ae29577436 100644 --- a/src/m3ninx/search/executor/executor.go +++ b/src/m3ninx/search/executor/executor.go @@ -33,7 +33,10 @@ var ( errExecutorClosed = errors.New("executor is closed") ) -type newIteratorFn func(s search.Searcher, rs index.Readers) (doc.Iterator, error) +type newIteratorFn func( + q search.Query, + rs index.Readers, +) (doc.Iterator, error) type executor struct { sync.RWMutex @@ -59,12 +62,7 @@ func (e *executor) Execute(q search.Query) (doc.Iterator, error) { return nil, errExecutorClosed } - s, err := q.Searcher() - if err != nil { - return nil, err - } - - iter, err := e.newIteratorFn(s, e.readers) + iter, err := e.newIteratorFn(q, e.readers) if err != nil { return nil, err } diff --git a/src/m3ninx/search/executor/iterator.go b/src/m3ninx/search/executor/iterator.go index 3be5506a09..7295e8e46d 100644 --- a/src/m3ninx/search/executor/iterator.go +++ b/src/m3ninx/search/executor/iterator.go @@ -23,10 +23,12 @@ package executor import ( "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" + "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/search" ) type iterator struct { + query search.Query searcher search.Searcher readers index.Readers @@ -38,8 +40,14 @@ type iterator struct { closed bool } -func newIterator(s search.Searcher, rs index.Readers) (doc.Iterator, error) { +func newIterator(q search.Query, rs index.Readers) (doc.Iterator, error) { + s, err := q.Searcher() + if err != nil { + return nil, err + } + it := &iterator{ + query: q, searcher: s, readers: rs, idx: -1, @@ -116,8 +124,16 @@ func (it *iterator) nextIter() (doc.Iterator, bool, error) { return nil, false, nil } - reader := it.readers[it.idx] - pl, err := it.searcher.Search(reader) + var ( + reader = it.readers[it.idx] + pl postings.List + err error + ) + if readThrough, ok := reader.(search.ReadThroughSegmentSearcher); ok { + pl, err = readThrough.Search(it.query, it.searcher) + } else { + pl, err = it.searcher.Search(reader) + } if err != nil { return nil, false, err } diff --git a/src/m3ninx/search/query/all.go b/src/m3ninx/search/query/all.go index 693662f870..dceb914c52 100644 --- a/src/m3ninx/search/query/all.go +++ b/src/m3ninx/search/query/all.go @@ -21,13 +21,13 @@ package query import ( - "fmt" - "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" "github.com/m3db/m3/src/m3ninx/search/searcher" ) +const allQueryStr = "all()" + // AllQuery returns a query which matches all known documents. type AllQuery struct{} @@ -62,5 +62,5 @@ func (q *AllQuery) ToProto() *querypb.Query { } func (q *AllQuery) String() string { - return fmt.Sprintf("all()") + return allQueryStr } diff --git a/src/m3ninx/search/query/conjunction.go b/src/m3ninx/search/query/conjunction.go index 0b906e4ad6..d56d32afda 100644 --- a/src/m3ninx/search/query/conjunction.go +++ b/src/m3ninx/search/query/conjunction.go @@ -21,7 +21,8 @@ package query import ( - "fmt" + "strings" + "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -30,6 +31,8 @@ import ( // ConjuctionQuery finds documents which match at least one of the given queries. 
type ConjuctionQuery struct { + sync.Mutex + strValue string queries []search.Query negations []search.Query } @@ -151,10 +154,26 @@ func (q *ConjuctionQuery) ToProto() *querypb.Query { } func (q *ConjuctionQuery) String() string { + q.Lock() + str := q.stringWithLock() + q.Unlock() + return str +} + +func (q *ConjuctionQuery) stringWithLock() string { + if q.strValue != "" { + return q.strValue + } + + var str strings.Builder + str.WriteString("conjunction(") + join(&str, q.queries) if len(q.negations) > 0 { - return fmt.Sprintf("conjunction(%s,%s)", - join(q.queries), joinNegation(q.negations)) + str.WriteRune(',') + joinNegation(&str, q.negations) } + str.WriteRune(')') - return fmt.Sprintf("conjunction(%s)", join(q.queries)) + q.strValue = str.String() + return q.strValue } diff --git a/src/m3ninx/search/query/disjunction.go b/src/m3ninx/search/query/disjunction.go index 6904c3b994..fe5905841d 100644 --- a/src/m3ninx/search/query/disjunction.go +++ b/src/m3ninx/search/query/disjunction.go @@ -21,7 +21,8 @@ package query import ( - "fmt" + "strings" + "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -30,7 +31,9 @@ import ( // DisjuctionQuery finds documents which match at least one of the given queries. type DisjuctionQuery struct { - queries []search.Query + sync.Mutex + strValue string + queries []search.Query } // NewDisjunctionQuery constructs a new query which matches documents that match any @@ -112,5 +115,22 @@ func (q *DisjuctionQuery) ToProto() *querypb.Query { } func (q *DisjuctionQuery) String() string { - return fmt.Sprintf("disjunction(%s)", join(q.queries)) + q.Lock() + str := q.stringWithLock() + q.Unlock() + return str +} + +func (q *DisjuctionQuery) stringWithLock() string { + if q.strValue != "" { + return q.strValue + } + + var str strings.Builder + str.WriteString("disjunction(") + join(&str, q.queries) + str.WriteRune(')') + + q.strValue = str.String() + return q.strValue } diff --git a/src/m3ninx/search/query/field.go b/src/m3ninx/search/query/field.go index 882f6bae91..a5da0fa671 100644 --- a/src/m3ninx/search/query/field.go +++ b/src/m3ninx/search/query/field.go @@ -22,7 +22,8 @@ package query import ( "bytes" - "fmt" + "strings" + "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -31,7 +32,9 @@ import ( // FieldQuery finds document which have the given field exactly. type FieldQuery struct { - field []byte + sync.Mutex + strValue string + field []byte } // NewFieldQuery constructs a new FieldQuery for the given field. @@ -78,5 +81,22 @@ func (q *FieldQuery) ToProto() *querypb.Query { } func (q *FieldQuery) String() string { - return fmt.Sprintf("field(%s)", q.field) + q.Lock() + str := q.stringWithLock() + q.Unlock() + return str +} + +func (q *FieldQuery) stringWithLock() string { + if q.strValue != "" { + return q.strValue + } + + var str strings.Builder + str.WriteString("field(") + str.Write(q.field) + str.WriteRune(')') + + q.strValue = str.String() + return q.strValue } diff --git a/src/m3ninx/search/query/negation.go b/src/m3ninx/search/query/negation.go index c4863dac6d..3dff751724 100644 --- a/src/m3ninx/search/query/negation.go +++ b/src/m3ninx/search/query/negation.go @@ -21,7 +21,8 @@ package query import ( - "fmt" + "strings" + "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -30,7 +31,9 @@ import ( // NegationQuery finds document which do not match a given query. 
type NegationQuery struct { - query search.Query + sync.Mutex + strValue string + query search.Query } // NewNegationQuery constructs a new NegationQuery for the given query. @@ -75,5 +78,22 @@ func (q *NegationQuery) ToProto() *querypb.Query { } func (q *NegationQuery) String() string { - return fmt.Sprintf("negation(%s)", q.query) + q.Lock() + str := q.stringWithLock() + q.Unlock() + return str +} + +func (q *NegationQuery) stringWithLock() string { + if q.strValue != "" { + return q.strValue + } + + var str strings.Builder + str.WriteString("negation(") + str.WriteString(q.query.String()) + str.WriteRune(')') + + q.strValue = str.String() + return q.strValue } diff --git a/src/m3ninx/search/query/regexp.go b/src/m3ninx/search/query/regexp.go index 475c4590c8..6a8185c16f 100644 --- a/src/m3ninx/search/query/regexp.go +++ b/src/m3ninx/search/query/regexp.go @@ -22,7 +22,8 @@ package query import ( "bytes" - "fmt" + "strings" + "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/index" @@ -32,6 +33,8 @@ import ( // RegexpQuery finds documents which match the given regular expression. type RegexpQuery struct { + sync.Mutex + strValue string field []byte regexp []byte compiled index.CompiledRegex @@ -93,5 +96,24 @@ func (q *RegexpQuery) ToProto() *querypb.Query { } func (q *RegexpQuery) String() string { - return fmt.Sprintf("regexp(%s, %s)", q.field, q.regexp) + q.Lock() + str := q.stringWithLock() + q.Unlock() + return str +} + +func (q *RegexpQuery) stringWithLock() string { + if q.strValue != "" { + return q.strValue + } + + var str strings.Builder + str.WriteString("regexp(") + str.Write(q.field) + str.WriteRune(',') + str.Write(q.regexp) + str.WriteRune(')') + + q.strValue = str.String() + return q.strValue } diff --git a/src/m3ninx/search/query/term.go b/src/m3ninx/search/query/term.go index 6783fe968d..3488d2d0a6 100644 --- a/src/m3ninx/search/query/term.go +++ b/src/m3ninx/search/query/term.go @@ -22,7 +22,8 @@ package query import ( "bytes" - "fmt" + "strings" + "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -31,8 +32,10 @@ import ( // TermQuery finds document which match the given term exactly. type TermQuery struct { - field []byte - term []byte + sync.Mutex + strValue string + field []byte + term []byte } // NewTermQuery constructs a new TermQuery for the given field and term. @@ -76,5 +79,24 @@ func (q *TermQuery) ToProto() *querypb.Query { } func (q *TermQuery) String() string { - return fmt.Sprintf("term(%s, %s)", q.field, q.term) + q.Lock() + str := q.stringWithLock() + q.Unlock() + return str +} + +func (q *TermQuery) stringWithLock() string { + if q.strValue != "" { + return q.strValue + } + + var str strings.Builder + str.WriteString("term(") + str.Write(q.field) + str.WriteRune(',') + str.Write(q.term) + str.WriteRune(')') + + q.strValue = str.String() + return q.strValue } diff --git a/src/m3ninx/search/query/util.go b/src/m3ninx/search/query/util.go index f162e12cc7..34d157ff5c 100644 --- a/src/m3ninx/search/query/util.go +++ b/src/m3ninx/search/query/util.go @@ -21,8 +21,7 @@ package query import ( - "bytes" - "fmt" + "strings" "github.com/m3db/m3/src/m3ninx/search" ) @@ -53,34 +52,34 @@ func singular(q search.Query) (search.Query, bool) { } // join concatenates a slice of queries. 
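The String() rewrites in this patch all share one shape: take the query's mutex, build the representation once with a strings.Builder, and memoize it in strValue so repeated cache-key construction stops allocating. A minimal standalone sketch of the idea follows; the type and fields are hypothetical, and it uses sync.Once where the patch uses an explicit mutex plus an empty-string check, which is equivalent for this purpose:

```go
package main

import (
	"fmt"
	"strings"
	"sync"
)

// memoStringer caches its string form after the first call, the same
// idea the query types implement with a mutex and a strValue field.
type memoStringer struct {
	once  sync.Once
	str   string
	field string
	term  string
}

func (m *memoStringer) String() string {
	m.once.Do(func() {
		var b strings.Builder
		b.WriteString("term(")
		b.WriteString(m.field)
		b.WriteRune(',')
		b.WriteString(m.term)
		b.WriteRune(')')
		m.str = b.String() // built exactly once, reused afterwards
	})
	return m.str
}

func main() {
	q := &memoStringer{field: "city", term: "nyc"}
	fmt.Println(q.String()) // term(city,nyc)
	fmt.Println(q.String()) // second call is a cheap field read
}
```

One caveat worth noting: embedding sync.Mutex (or sync.Once) makes these structs unsafe to copy after first use, which appears fine here since the query types are constructed and passed by pointer.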
-func join(qs []search.Query) string { +func join(b *strings.Builder, qs []search.Query) { switch len(qs) { case 0: - return "" + return case 1: - return qs[0].String() + b.WriteString(qs[0].String()) + return } - var b bytes.Buffer b.WriteString(qs[0].String()) for _, q := range qs[1:] { b.WriteString(separator) b.WriteString(q.String()) } - - return b.String() } // joinNegation concatenates a slice of negated queries. -func joinNegation(qs []search.Query) string { +func joinNegation(b *strings.Builder, qs []search.Query) { switch len(qs) { case 0: - return "" + return case 1: - return fmt.Sprintf("%s%s%s", negationPrefix, qs[0].String(), negationPostfix) + b.WriteString(negationPrefix) + b.WriteString(qs[0].String()) + b.WriteString(negationPostfix) + return } - var b bytes.Buffer b.WriteString(negationPrefix) b.WriteString(qs[0].String()) for _, q := range qs[1:] { @@ -89,5 +88,4 @@ func joinNegation(qs []search.Query) string { } b.WriteString(negationPostfix) - return b.String() } diff --git a/src/m3ninx/search/query/util_test.go b/src/m3ninx/search/query/util_test.go index 6159b96bc8..56915c410f 100644 --- a/src/m3ninx/search/query/util_test.go +++ b/src/m3ninx/search/query/util_test.go @@ -21,6 +21,7 @@ package query import ( + "strings" "testing" "github.com/m3db/m3/src/m3ninx/search" @@ -64,8 +65,11 @@ func TestJoin(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - require.Equal(t, test.expected, join(test.input)) - require.Equal(t, test.expectedNegation, joinNegation(test.input)) + var j, jn strings.Builder + join(&j, test.input) + joinNegation(&jn, test.input) + require.Equal(t, test.expected, j.String()) + require.Equal(t, test.expectedNegation, jn.String()) }) } } diff --git a/src/m3ninx/search/types.go b/src/m3ninx/search/types.go index d4fc76b4ab..6800087990 100644 --- a/src/m3ninx/search/types.go +++ b/src/m3ninx/search/types.go @@ -61,3 +61,9 @@ type Searcher interface { // Searchers is a slice of Searcher. type Searchers []Searcher + +// ReadThroughSegmentSearcher searches a read through segment +// and potentially caches the result. +type ReadThroughSegmentSearcher interface { + Search(query Query, searcher Searcher) (postings.List, error) +} From 31d581644ad0a15fbcd53b239d733a603027eaf0 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 14 Nov 2020 00:34:55 -0500 Subject: [PATCH 033/106] Fix typo --- src/dbnode/storage/index/read_through_segment.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 60c65e62bd..3ca60cbc19 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -372,7 +372,7 @@ func (s *readThroughSegmentReader) Search( if willCache { // Only cache the second time seen a recent query since // copying the postings lists into a roaring postings list - // + // can be expensive (in PutSearch). 
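The comment completed by this fix describes a deliberate admission policy: a search postings list is only copied into the cache the second time the query is seen recently, so one-off queries never pay the cost of materializing a read-only postings list into a roaring bitmap. A sketch of that heuristic in isolation, with hypothetical names and the same random-eviction bound on the tracking map:

```go
package main

import "fmt"

// admitOnSecondSighting tracks recently seen keys and reports whether
// a key should be cached: admit only once a key repeats, so one-off
// queries never trigger the expensive copy that caching requires.
type admitOnSecondSighting struct {
	max  int
	seen map[string]int
}

func (a *admitOnSecondSighting) admit(key string) bool {
	count := a.seen[key] + 1
	if count > 1 {
		// Second recent sighting: cache it and stop tracking.
		delete(a.seen, key)
		return true
	}
	if len(a.seen) >= a.max {
		// Evict an arbitrary tracked key to bound memory, as the
		// patch does with its map of seen queries.
		for k := range a.seen {
			delete(a.seen, k)
			break
		}
	}
	a.seen[key] = count
	return false
}

func main() {
	a := &admitOnSecondSighting{max: 4, seen: map[string]int{}}
	fmt.Println(a.admit("term(city,nyc)")) // false: first sighting
	fmt.Println(a.admit("term(city,nyc)")) // true: seen twice, cache it
}
```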
s.postingsListCache.PutSearch(s.uuid, queryStr, pl) } From 7da3dd5419b6892312fa455ab1a4340b465987ab Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 14 Nov 2020 19:15:08 -0500 Subject: [PATCH 034/106] Set postings list pooling --- src/dbnode/server/server.go | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index 7d8b9f9d29..2990a1bd80 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -558,6 +558,7 @@ func Run(runOpts RunOptions) { var ( plCacheSize = plCacheConfig.SizeOrDefault() plCacheOptions = index.PostingsListCacheOptions{ + PostingsListPool: opts.IndexOptions().SegmentBuilderOptions().PostingsListPool(), InstrumentOptions: opts.InstrumentOptions(). SetMetricsScope(scope.SubScope("postings-list-cache")), } From 1084aa832a45613da74d780198fcda6f985e9195 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 16 Nov 2020 16:57:02 -0500 Subject: [PATCH 035/106] Propagate postings list cache --- src/dbnode/storage/index/read_through_segment.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 3ca60cbc19..7b9af90b6a 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -194,10 +194,11 @@ func newReadThroughSegmentReader( opts ReadThroughSegmentOptions, ) segment.Reader { return &readThroughSegmentReader{ - seg: seg, - reader: reader, - opts: opts, - uuid: uuid, + seg: seg, + reader: reader, + opts: opts, + uuid: uuid, + postingsListCache: cache, } } From 03203b71274a0f9f8baa97e41762f347dc4d1c6d Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 16 Nov 2020 17:46:32 -0500 Subject: [PATCH 036/106] Use independent postings list cache for searches --- src/dbnode/server/server.go | 15 ++- src/dbnode/storage/index/block.go | 7 +- src/dbnode/storage/index/mutable_segments.go | 7 +- src/dbnode/storage/index/options.go | 11 ++ .../storage/index/read_through_segment.go | 113 +++++++++++------- src/dbnode/storage/index/types.go | 6 + 6 files changed, 111 insertions(+), 48 deletions(-) diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index 2990a1bd80..fb90ab2e19 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -563,14 +563,21 @@ func Run(runOpts RunOptions) { SetMetricsScope(scope.SubScope("postings-list-cache")), } ) - postingsListCache, stopReporting, err := index.NewPostingsListCache(plCacheSize, plCacheOptions) + segmentPostingsListCache, segmentStopReporting, err := index.NewPostingsListCache(plCacheSize, plCacheOptions) if err != nil { - logger.Fatal("could not construct postings list cache", zap.Error(err)) + logger.Fatal("could not construct segment postings list cache", zap.Error(err)) } - defer stopReporting() + defer segmentStopReporting() + + searchPostingsListCache, searchStopReporting, err := index.NewPostingsListCache(plCacheSize, plCacheOptions) + if err != nil { + logger.Fatal("could not construct searches postings list cache", zap.Error(err)) + } + defer searchStopReporting() opts = opts.SetIndexOptions(opts.IndexOptions(). - SetPostingsListCache(postingsListCache)) + SetPostingsListCache(segmentPostingsListCache). + SetSearchPostingsListCache(searchPostingsListCache)) // Setup index regexp compilation cache. 
m3ninxindex.SetRegexpCacheOptions(m3ninxindex.RegexpCacheOptions{ diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 44175adc37..b6c3b8b03b 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -958,7 +958,10 @@ func (b *block) addResults( } var ( - plCache = b.opts.PostingsListCache() + plCaches = ReadThroughSegmentCaches{ + SegmentPostingsListCache: b.opts.PostingsListCache(), + SearchPostingsListCache: b.opts.SearchPostingsListCache(), + } readThroughOpts = b.opts.ReadThroughSegmentOptions() segments = results.Segments() ) @@ -967,7 +970,7 @@ func (b *block) addResults( elem := seg.Segment() if immSeg, ok := elem.(segment.ImmutableSegment); ok { // only wrap the immutable segments with a read through cache. - elem = NewReadThroughSegment(immSeg, plCache, readThroughOpts) + elem = NewReadThroughSegment(immSeg, plCaches, readThroughOpts) } readThroughSegments = append(readThroughSegments, elem) } diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 6885266ca1..c8973cc1f4 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -448,10 +448,13 @@ func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment { var ( - plCache = m.opts.PostingsListCache() + plCaches = ReadThroughSegmentCaches{ + SegmentPostingsListCache: m.opts.PostingsListCache(), + SearchPostingsListCache: m.opts.SearchPostingsListCache(), + } readThroughOpts = m.opts.ReadThroughSegmentOptions() ) - return NewReadThroughSegment(seg, plCache, readThroughOpts) + return NewReadThroughSegment(seg, plCaches, readThroughOpts) } func (m *mutableSegments) backgroundCompactWithTask( diff --git a/src/dbnode/storage/index/options.go b/src/dbnode/storage/index/options.go index e27f8d729e..d23e7e0a61 100644 --- a/src/dbnode/storage/index/options.go +++ b/src/dbnode/storage/index/options.go @@ -125,6 +125,7 @@ type opts struct { foregroundCompactionPlannerOpts compaction.PlannerOptions backgroundCompactionPlannerOpts compaction.PlannerOptions postingsListCache *PostingsListCache + searchPostingsListCache *PostingsListCache readThroughSegmentOptions ReadThroughSegmentOptions mmapReporter mmap.Reporter queryLimits limits.QueryLimits @@ -394,6 +395,16 @@ func (o *opts) SetPostingsListCache(value *PostingsListCache) Options { } func (o *opts) PostingsListCache() *PostingsListCache { + return o.searchPostingsListCache +} + +func (o *opts) SetSearchPostingsListCache(value *PostingsListCache) Options { + opts := *o + opts.searchPostingsListCache = value + return &opts +} + +func (o *opts) SearchPostingsListCache() *PostingsListCache { return o.postingsListCache } diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 7b9af90b6a..ad5e0c5ae7 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -33,8 +33,6 @@ import ( "github.com/pborman/uuid" ) -const maxUniqueQueryCount = 2 << 14 // 32k - var ( errCantGetReaderFromClosedSegment = errors.New("cant get reader from closed segment") errCantCloseClosedSegment = errors.New("cant close closed segment") @@ -56,8 +54,8 @@ type ReadThroughSegment struct { segment segment.ImmutableSegment - uuid uuid.UUID - postingsListCache *PostingsListCache + uuid uuid.UUID + caches ReadThroughSegmentCaches opts 
ReadThroughSegmentOptions @@ -71,6 +69,13 @@ type readThroughSegmentSearches struct { queries map[string]int } +// ReadThroughSegmentCaches is the set of caches +// to use for the read through segment. +type ReadThroughSegmentCaches struct { + SegmentPostingsListCache *PostingsListCache + SearchPostingsListCache *PostingsListCache +} + // ReadThroughSegmentOptions is the options struct for the // ReadThroughSegment. type ReadThroughSegmentOptions struct { @@ -88,14 +93,14 @@ type ReadThroughSegmentOptions struct { // NewReadThroughSegment creates a new read through segment. func NewReadThroughSegment( seg segment.ImmutableSegment, - cache *PostingsListCache, + caches ReadThroughSegmentCaches, opts ReadThroughSegmentOptions, ) segment.Segment { return &ReadThroughSegment{ - segment: seg, - opts: opts, - uuid: uuid.NewUUID(), - postingsListCache: cache, + segment: seg, + opts: opts, + uuid: uuid.NewUUID(), + caches: caches, searches: readThroughSegmentSearches{ queries: make(map[string]int), }, @@ -114,8 +119,7 @@ func (r *ReadThroughSegment) Reader() (segment.Reader, error) { if err != nil { return nil, err } - return newReadThroughSegmentReader(r, reader, r.uuid, - r.postingsListCache, r.opts), nil + return newReadThroughSegmentReader(r, reader, r.uuid, r.caches, r.opts), nil } // Close purges all entries in the cache associated with this segment, @@ -129,15 +133,40 @@ func (r *ReadThroughSegment) Close() error { r.closed = true - if r.postingsListCache != nil { - // Purge segments from the cache before closing the segment to avoid - // temporarily having postings lists in the cache whose underlying - // bytes are no longer mmap'd. - r.postingsListCache.PurgeSegment(r.uuid) - } + // Purge segments from the cache before closing the segment to avoid + // temporarily having postings lists in the cache whose underlying + // bytes are no longer mmap'd. + closer := cacheCloser{segment: r} + closer.closeCaches() return r.segment.Close() } +type cacheCloser struct { + segment *ReadThroughSegment + closed []*PostingsListCache +} + +func (c *cacheCloser) closeCaches() { + c.closeCache(c.segment.caches.SegmentPostingsListCache) + c.closeCache(c.segment.caches.SearchPostingsListCache) +} + +func (c *cacheCloser) closeCache(cache *PostingsListCache) { + if cache == nil { + return + } + for _, elem := range c.closed { + if elem == cache { + // Already closed. + break + } + } + // Close. + cache.PurgeSegment(c.segment.uuid) + // Add to list of unique closed caches. + c.closed = append(c.closed, cache) +} + // FieldsIterable is a pass through call to the segment, since there's no // postings lists to cache for queries. func (r *ReadThroughSegment) FieldsIterable() segment.FieldsIterable { @@ -180,25 +209,25 @@ type readThroughSegmentReader struct { // reader is explicitly not embedded at the top level // of the struct to force new methods added to index.Reader // to be explicitly supported by the read through cache. 
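The comment above is the reason reader is a named field rather than an embedded one. With embedding, a method newly added to segment.Reader would be satisfied by promotion and silently bypass the cache; with explicit delegation the wrapper fails to compile until someone decides whether the new method should be cached. A toy illustration under hypothetical interfaces:

```go
package main

import "fmt"

type reader interface {
	MatchTerm(term string) string
	// If a new method is added here, cachedReader below fails to
	// compile until it explicitly implements it, forcing a decision
	// about whether the new call should go through the cache.
}

type plainReader struct{}

func (plainReader) MatchTerm(term string) string { return "postings(" + term + ")" }

// cachedReader deliberately does NOT embed reader; every method must
// be forwarded by hand, so nothing slips past the cache via promotion.
type cachedReader struct {
	r reader
}

func (c cachedReader) MatchTerm(term string) string {
	// A cache lookup would go here before delegating.
	return c.r.MatchTerm(term)
}

func main() {
	var r reader = cachedReader{r: plainReader{}}
	fmt.Println(r.MatchTerm("nyc"))
}
```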
- reader segment.Reader - opts ReadThroughSegmentOptions - uuid uuid.UUID - postingsListCache *PostingsListCache + reader segment.Reader + opts ReadThroughSegmentOptions + uuid uuid.UUID + caches ReadThroughSegmentCaches } func newReadThroughSegmentReader( seg *ReadThroughSegment, reader segment.Reader, uuid uuid.UUID, - cache *PostingsListCache, + caches ReadThroughSegmentCaches, opts ReadThroughSegmentOptions, ) segment.Reader { return &readThroughSegmentReader{ - seg: seg, - reader: reader, - opts: opts, - uuid: uuid, - postingsListCache: cache, + seg: seg, + reader: reader, + opts: opts, + uuid: uuid, + caches: caches, } } @@ -208,21 +237,22 @@ func (s *readThroughSegmentReader) MatchRegexp( field []byte, c index.CompiledRegex, ) (postings.List, error) { - if s.postingsListCache == nil || !s.opts.CacheRegexp { + cache := s.caches.SegmentPostingsListCache + if cache == nil || !s.opts.CacheRegexp { return s.reader.MatchRegexp(field, c) } // TODO(rartoul): Would be nice to not allocate strings here. fieldStr := string(field) patternStr := c.FSTSyntax.String() - pl, ok := s.postingsListCache.GetRegexp(s.uuid, fieldStr, patternStr) + pl, ok := cache.GetRegexp(s.uuid, fieldStr, patternStr) if ok { return pl, nil } pl, err := s.reader.MatchRegexp(field, c) if err == nil { - s.postingsListCache.PutRegexp(s.uuid, fieldStr, patternStr, pl) + cache.PutRegexp(s.uuid, fieldStr, patternStr, pl) } return pl, err } @@ -232,21 +262,22 @@ func (s *readThroughSegmentReader) MatchRegexp( func (s *readThroughSegmentReader) MatchTerm( field []byte, term []byte, ) (postings.List, error) { - if s.postingsListCache == nil || !s.opts.CacheTerms { + cache := s.caches.SegmentPostingsListCache + if cache == nil || !s.opts.CacheTerms { return s.reader.MatchTerm(field, term) } // TODO(rartoul): Would be nice to not allocate strings here. fieldStr := string(field) patternStr := string(term) - pl, ok := s.postingsListCache.GetTerm(s.uuid, fieldStr, patternStr) + pl, ok := cache.GetTerm(s.uuid, fieldStr, patternStr) if ok { return pl, nil } pl, err := s.reader.MatchTerm(field, term) if err == nil { - s.postingsListCache.PutTerm(s.uuid, fieldStr, patternStr, pl) + cache.PutTerm(s.uuid, fieldStr, patternStr, pl) } return pl, err } @@ -254,20 +285,21 @@ func (s *readThroughSegmentReader) MatchTerm( // MatchField returns a cached posting list or queries the underlying // segment if their is a cache miss. func (s *readThroughSegmentReader) MatchField(field []byte) (postings.List, error) { - if s.postingsListCache == nil || !s.opts.CacheTerms { + cache := s.caches.SegmentPostingsListCache + if cache == nil || !s.opts.CacheTerms { return s.reader.MatchField(field) } // TODO(rartoul): Would be nice to not allocate strings here. fieldStr := string(field) - pl, ok := s.postingsListCache.GetField(s.uuid, fieldStr) + pl, ok := cache.GetField(s.uuid, fieldStr) if ok { return pl, nil } pl, err := s.reader.MatchField(field) if err == nil { - s.postingsListCache.PutField(s.uuid, fieldStr, pl) + cache.PutField(s.uuid, fieldStr, pl) } return pl, err } @@ -329,13 +361,14 @@ func (s *readThroughSegmentReader) Search( query search.Query, searcher search.Searcher, ) (postings.List, error) { - if s.postingsListCache == nil || !s.opts.CacheSearches { + cache := s.caches.SearchPostingsListCache + if cache == nil || !s.opts.CacheSearches { return searcher.Search(s) } // TODO(r): Would be nice to not allocate strings here. 
queryStr := query.String() - pl, ok := s.postingsListCache.GetSearch(s.uuid, queryStr) + pl, ok := cache.GetSearch(s.uuid, queryStr) if ok { return pl, nil } @@ -349,7 +382,7 @@ func (s *readThroughSegmentReader) Search( count := 1 curr, ok := s.seg.searches.queries[queryStr] if !ok { - if len(s.seg.searches.queries) >= maxUniqueQueryCount { + if len(s.seg.searches.queries) >= cache.size { // Delete a random key to make room. for k := range s.seg.searches.queries { delete(s.seg.searches.queries, k) @@ -374,7 +407,7 @@ func (s *readThroughSegmentReader) Search( // Only cache the second time seen a recent query since // copying the postings lists into a roaring postings list // can be expensive (in PutSearch). - s.postingsListCache.PutSearch(s.uuid, queryStr, pl) + cache.PutSearch(s.uuid, queryStr, pl) } return pl, nil diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 86761d728a..1957752f75 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -960,6 +960,12 @@ type Options interface { // PostingsListCache returns the postings list cache. PostingsListCache() *PostingsListCache + // SetSearchPostingsListCache sets the postings list cache. + SetSearchPostingsListCache(value *PostingsListCache) Options + + // SearchPostingsListCache returns the postings list cache. + SearchPostingsListCache() *PostingsListCache + // SetReadThroughSegmentOptions sets the read through segment cache options. SetReadThroughSegmentOptions(value ReadThroughSegmentOptions) Options From 36a16908a1a728965af93c7e7cf62c99b1270aeb Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 18 Nov 2020 02:54:33 -0500 Subject: [PATCH 037/106] Use concurrent cache and striped context finalizers --- go.mod | 1 + go.sum | 3 + .../storage/index/postings_list_cache.go | 116 +++++---- .../storage/index/postings_list_cache_lru.go | 237 ------------------ .../index/postings_list_cache_lru_test.go | 32 --- .../storage/index/read_through_segment.go | 31 --- src/x/context/context.go | 131 ++++++---- src/x/context/context_test.go | 6 + 8 files changed, 159 insertions(+), 398 deletions(-) delete mode 100644 src/dbnode/storage/index/postings_list_cache_lru.go delete mode 100644 src/dbnode/storage/index/postings_list_cache_lru_test.go diff --git a/go.mod b/go.mod index 2bf36f54e1..1e3011c527 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( github.com/containerd/continuity v0.0.0-20200413184840-d3ef23f19fbb // indirect github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f github.com/davecgh/go-spew v1.1.1 + github.com/dgraph-io/ristretto v0.0.3 github.com/docker/go-connections v0.4.0 // indirect github.com/fatih/color v1.10.0 // indirect github.com/fortytw2/leaktest v1.2.1-0.20180901000122-b433bbd6d743 diff --git a/go.sum b/go.sum index 4feefc4967..8dc572c330 100644 --- a/go.sum +++ b/go.sum @@ -173,8 +173,11 @@ github.com/daviddengcn/go-assert v0.0.0-20150305222929-ba7e68aeeff6 h1:OPIYL/VhQ github.com/daviddengcn/go-assert v0.0.0-20150305222929-ba7e68aeeff6/go.mod h1:N+OekMaElW3rSAfDdNX6Dff3HS237/OhC08jYFW4oCw= github.com/daviddengcn/go-villa v0.0.0-20160111144444-3f35da8ba982 h1:2Trx4ntMtxmus9nN2w1PIqJOI8jB3RjlnDnFm/ImlIU= github.com/daviddengcn/go-villa v0.0.0-20160111144444-3f35da8ba982/go.mod h1:U8xNoHcXfPnZzy9zCxeKRjaJgC1d3613rFHjZVVAqKc= +github.com/dgraph-io/ristretto v0.0.3 h1:jh22xisGBjrEVnRZ1DVTpBVQm0Xndu8sMl0CWDzSIBI= +github.com/dgraph-io/ristretto v0.0.3/go.mod h1:KPxhHT9ZxKefz+PCeOGsrHpl1qZ7i70dGTu2u+Ahh6E= github.com/dgrijalva/jwt-go 
v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/dgryski/go-sip13 v0.0.0-20190329191031-25c5027a8c7b/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index c2b2139792..3f1b6fc8c2 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -21,14 +21,16 @@ package index import ( + "bytes" "errors" - "sync" "time" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/x/instrument" + "github.com/cespare/xxhash/v2" + "github.com/dgraph-io/ristretto" "github.com/pborman/uuid" "github.com/uber-go/tally" "go.uber.org/zap" @@ -41,20 +43,20 @@ var ( // PatternType is an enum for the various pattern types. It allows us // separate them logically within the cache. -type PatternType int +type PatternType string // Closer represents a function that will close managed resources. type Closer func() const ( // PatternTypeRegexp indicates that the pattern is of type regexp. - PatternTypeRegexp PatternType = iota + PatternTypeRegexp PatternType = "regexp" // PatternTypeTerm indicates that the pattern is of type term. - PatternTypeTerm + PatternTypeTerm PatternType = "term" // PatternTypeField indicates that the pattern is of type field. - PatternTypeField + PatternTypeField PatternType = "field" // PatternTypeSearch indicates that the pattern is of type search. - PatternTypeSearch + PatternTypeSearch PatternType = "search" reportLoopInterval = 10 * time.Second emptyPattern = "" @@ -79,9 +81,7 @@ func (o PostingsListCacheOptions) Validate() error { // PostingsListCache implements an LRU for caching queries and their results. type PostingsListCache struct { - sync.Mutex - - lru *postingsListLRU + lru *ristretto.Cache size int opts PostingsListCacheOptions @@ -99,7 +99,14 @@ func NewPostingsListCache( return nil, nil, err } - lru, err := newPostingsListLRU(size, nil) + lru, err := ristretto.NewCache(&ristretto.Config{ + NumCounters: int64(10 * size), // number of keys to track frequency of. + MaxCost: int64(size), // maximum cost of cache. + BufferItems: 64, // number of keys per Get buffer. + KeyToHash: func(k interface{}) (uint64, uint64) { + return k.(uint64), 0 + }, + }) if err != nil { return nil, nil, err } @@ -156,10 +163,15 @@ func (q *PostingsListCache) get( pattern string, patternType PatternType, ) (postings.List, bool) { - // No RLock because a Get() operation mutates the LRU. 
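This commit swaps the hand-rolled, mutex-guarded LRU for dgraph-io/ristretto, whose Get and Set are internally synchronized, which is why the deleted comment and locking below disappear. A small self-contained example of the API as configured here; sizing NumCounters at roughly ten times the maximum number of entries is ristretto's documented rule of thumb for its TinyLFU admission counters:

```go
package main

import (
	"fmt"
	"time"

	"github.com/dgraph-io/ristretto"
)

func main() {
	cache, err := ristretto.NewCache(&ristretto.Config{
		NumCounters: 10 * 1024, // ~10x max entries, for frequency counters
		MaxCost:     1024,      // with a cost of 1 per Set, caps entries at 1024
		BufferItems: 64,        // recommended Get buffer size
	})
	if err != nil {
		panic(err)
	}

	// Sets are buffered and applied asynchronously (and may even be
	// dropped under contention); a short pause makes the write visible
	// in this example. Newer ristretto versions expose Wait() for this.
	cache.Set("term(city,nyc)", "postings", 1)
	time.Sleep(10 * time.Millisecond)

	if v, ok := cache.Get("term(city,nyc)"); ok {
		fmt.Println(v) // postings
	}
}
```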
- q.Lock() - p, ok := q.lru.Get(segmentUUID, field, pattern, patternType) - q.Unlock() + var pl *cachedPostings + entry, ok := q.lru.Get(keyHash(segmentUUID, field, pattern, patternType)) + if ok { + pl = entry.(*cachedPostings) + ok = bytes.Equal(segmentUUID, pl.segmentUUID) && + field == pl.field && + pattern == pl.pattern && + patternType == pl.patternType + } q.emitCacheGetMetrics(patternType, ok) @@ -167,7 +179,33 @@ func (q *PostingsListCache) get( return nil, false } - return p, ok + return pl.postings, ok +} + +type cachedPostings struct { + // key + segmentUUID uuid.UUID + field string + pattern string + patternType PatternType + + // value + postings postings.List +} + +func keyHash( + segmentUUID uuid.UUID, + field string, + pattern string, + patternType PatternType, +) uint64 { + var h xxhash.Digest + h.Reset() + _, _ = h.Write(segmentUUID) + _, _ = h.WriteString(field) + _, _ = h.WriteString(pattern) + _, _ = h.WriteString(string(patternType)) + return h.Sum64() } // PutRegexp updates the LRU with the result of the regexp query. @@ -177,8 +215,7 @@ func (q *PostingsListCache) PutRegexp( pattern string, pl postings.List, ) { - q.put(segmentUUID, field, pattern, PatternTypeRegexp, pl, - postingsListMetadata{}) + q.put(segmentUUID, field, pattern, PatternTypeRegexp, pl) } // PutTerm updates the LRU with the result of the term query. @@ -188,8 +225,7 @@ func (q *PostingsListCache) PutTerm( pattern string, pl postings.List, ) { - q.put(segmentUUID, field, pattern, PatternTypeTerm, pl, - postingsListMetadata{}) + q.put(segmentUUID, field, pattern, PatternTypeTerm, pl) } // PutField updates the LRU with the result of the field query. @@ -198,8 +234,7 @@ func (q *PostingsListCache) PutField( field string, pl postings.List, ) { - q.put(segmentUUID, field, emptyPattern, PatternTypeField, pl, - postingsListMetadata{}) + q.put(segmentUUID, field, emptyPattern, PatternTypeField, pl) } // PutSearch updates the LRU with the result of a search query. @@ -208,12 +243,10 @@ func (q *PostingsListCache) PutSearch( query string, pl postings.List, ) { - pooled := false if roaring.IsReadOnlyPostingsList(pl) { // Copy into mutable postings list since it's expensive to read from // a read only postings list over and over again (it's lazily // evaluated from for allocation purposes). - pooled = true mutable := q.opts.PostingsListPool.Get() if err := mutable.AddIterator(pl.Iterator()); err != nil { q.metrics.pooledGetErrAddIter.Inc(1) @@ -223,8 +256,7 @@ func (q *PostingsListCache) PutSearch( pl = mutable } - q.put(segmentUUID, query, emptyPattern, PatternTypeSearch, pl, - postingsListMetadata{Pooled: pooled}) + q.put(segmentUUID, query, emptyPattern, PatternTypeSearch, pl) } func (q *PostingsListCache) put( @@ -233,22 +265,19 @@ func (q *PostingsListCache) put( pattern string, patternType PatternType, pl postings.List, - meta postingsListMetadata, ) { - q.Lock() - q.lru.Add(segmentUUID, field, pattern, patternType, pl, meta) - q.Unlock() + key := keyHash(segmentUUID, field, pattern, patternType) + value := &cachedPostings{ + segmentUUID: segmentUUID, + field: field, + pattern: pattern, + patternType: patternType, + postings: pl, + } + q.lru.Set(key, value, 1) q.emitCachePutMetrics(patternType) } -// PurgeSegment removes all postings lists associated with the specified -// segment from the cache. 
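Since keyHash reduces the (segment UUID, field, pattern, pattern type) tuple to a single 64-bit xxhash, two distinct keys can in principle collide. Storing the full key inside cachedPostings and re-checking every component on a hit turns a collision into a harmless cache miss rather than returning another query's postings. The shape of that verification, sketched with a plain map standing in for the cache:

```go
package main

import (
	"fmt"

	"github.com/cespare/xxhash/v2"
)

// cached retains the full key next to the value so a 64-bit hash
// collision can be detected at read time.
type cached struct {
	field   string
	pattern string
	value   string
}

func key(field, pattern string) uint64 {
	var h xxhash.Digest
	h.Reset() // the zero Digest must be reset before use
	_, _ = h.WriteString(field)
	_, _ = h.WriteString(pattern)
	return h.Sum64()
}

func main() {
	store := map[uint64]cached{} // plain map standing in for ristretto
	store[key("city", "n.*")] = cached{field: "city", pattern: "n.*", value: "postings"}

	if c, ok := store[key("city", "n.*")]; ok &&
		c.field == "city" && c.pattern == "n.*" { // reject collisions
		fmt.Println(c.value)
	}
}
```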
-func (q *PostingsListCache) PurgeSegment(segmentUUID uuid.UUID) { - q.Lock() - q.lru.PurgeSegment(segmentUUID) - q.Unlock() -} - // startReportLoop starts a background process that will call Report() // on a regular basis and returns a function that will end the background // process. @@ -273,18 +302,7 @@ func (q *PostingsListCache) startReportLoop() Closer { // Report will emit metrics about the status of the cache. func (q *PostingsListCache) Report() { - var ( - size float64 - capacity float64 - ) - - q.Lock() - size = float64(q.lru.Len()) - capacity = float64(q.size) - q.Unlock() - - q.metrics.size.Update(size) - q.metrics.capacity.Update(capacity) + q.metrics.capacity.Update(float64(q.size)) } func (q *PostingsListCache) emitCacheGetMetrics(patternType PatternType, hit bool) { diff --git a/src/dbnode/storage/index/postings_list_cache_lru.go b/src/dbnode/storage/index/postings_list_cache_lru.go deleted file mode 100644 index ed27ac42b0..0000000000 --- a/src/dbnode/storage/index/postings_list_cache_lru.go +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2019 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package index - -import ( - "container/list" - "errors" - - "github.com/m3db/m3/src/m3ninx/postings" - - "github.com/pborman/uuid" -) - -// PostingsListLRU implements a non-thread safe fixed size LRU cache of postings lists -// that were resolved by running a given query against a particular segment for a given -// field and pattern type (term vs regexp). Normally a key in the LRU would look like: -// -// type key struct { -// segmentUUID uuid.UUID -// field string -// pattern string -// patternType PatternType -// } -// -// However, some of the postings lists that we will store in the LRU have a fixed lifecycle -// because they reference mmap'd byte slices which will eventually be unmap'd. To prevent -// these postings lists that point to unmap'd regions from remaining in the LRU, we want to -// support the ability to efficiently purge the LRU of any postings list that belong to a -// given segment. This isn't technically required for correctness as once a segment has been -// closed, its old postings list in the LRU will never be accessed again (since they are only -// addressable by that segments UUID), but we purge them from the LRU before closing the segment -// anyways as an additional safety precaution. 
-// -// Instead of adding additional tracking on-top of an existing generic LRU, we've created a -// specialized LRU that instead of having a single top-level map pointing into the linked-list, -// has a two-level map where the top level map is keyed by segment UUID and the second level map -// is keyed by the field/pattern/patternType. -// -// As a result, when a segment is ready to be closed, they can call into the cache with their -// UUID and we can efficiently remove all the entries corresponding to that segment from the -// LRU. The specialization has the additional nice property that we don't need to allocate everytime -// we add an item to the LRU due to the interface{} conversion. -type postingsListLRU struct { - size int - evictList *list.List - items map[uuid.Array]map[key]*list.Element - onRemove onRemoveFn -} - -type onRemoveFn func(pl postings.List, metadata postingsListMetadata) - -// postingsListMetadata is metadata about the postings list. -type postingsListMetadata struct { - Pooled bool -} - -// entry is used to hold a value in the evictList. -type entry struct { - uuid uuid.UUID - key key - postingsList postings.List - metadata postingsListMetadata -} - -type key struct { - field string - pattern string - patternType PatternType -} - -// newPostingsListLRU constructs an LRU of the given size. -func newPostingsListLRU( - size int, - onRemove onRemoveFn, -) (*postingsListLRU, error) { - if size <= 0 { - return nil, errors.New("Must provide a positive size") - } - - return &postingsListLRU{ - size: size, - evictList: list.New(), - items: make(map[uuid.Array]map[key]*list.Element), - onRemove: onRemove, - }, nil -} - -// Add adds a value to the cache. Returns true if an eviction occurred. -func (c *postingsListLRU) Add( - segmentUUID uuid.UUID, - field string, - pattern string, - patternType PatternType, - pl postings.List, - metadata postingsListMetadata, -) (evicted bool) { - newKey := newKey(field, pattern, patternType) - // Check for existing item. - uuidArray := segmentUUID.Array() - if uuidEntries, ok := c.items[uuidArray]; ok { - if ent, ok := uuidEntries[newKey]; ok { - // If it already exists, just move it to the front. This avoids storing - // the same item in the LRU twice which is important because the maps - // can only point to one entry at a time and we use them for purges. Also, - // it saves space by avoiding storing duplicate values. - c.evictList.MoveToFront(ent) - e := ent.Value.(*entry) - e.postingsList = pl - e.metadata = metadata - return false - } - } - - // Add new item. - var ( - ent = &entry{ - uuid: segmentUUID, - key: newKey, - postingsList: pl, - metadata: metadata, - } - entry = c.evictList.PushFront(ent) - ) - if queries, ok := c.items[uuidArray]; ok { - queries[newKey] = entry - } else { - c.items[uuidArray] = map[key]*list.Element{ - newKey: entry, - } - } - - evict := c.evictList.Len() > c.size - // Verify size not exceeded. - if evict { - c.removeOldest() - } - return evict -} - -// Get looks up a key's value from the cache. -func (c *postingsListLRU) Get( - segmentUUID uuid.UUID, - field string, - pattern string, - patternType PatternType, -) (postings.List, bool) { - newKey := newKey(field, pattern, patternType) - uuidArray := segmentUUID.Array() - if uuidEntries, ok := c.items[uuidArray]; ok { - if ent, ok := uuidEntries[newKey]; ok { - c.evictList.MoveToFront(ent) - return ent.Value.(*entry).postingsList, true - } - } - - return nil, false -} - -// Remove removes the provided key from the cache, returning if the -// key was contained. 
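The deleted comment above captures the one capability the old cache had that ristretto does not: entries were indexed by segment UUID first, so closing a segment could drop everything it owned with a single top-level delete. A stripped-down sketch of that two-level shape, eviction bookkeeping omitted:

```go
package main

import "fmt"

// twoLevelCache keys entries by segment UUID first, then by query key,
// so purging a closed segment is a single top-level map delete.
type twoLevelCache struct {
	items map[string]map[string]string // uuid -> query key -> postings
}

func (c *twoLevelCache) put(uuid, key, value string) {
	if c.items[uuid] == nil {
		c.items[uuid] = map[string]string{}
	}
	c.items[uuid][key] = value
}

func (c *twoLevelCache) purgeSegment(uuid string) {
	delete(c.items, uuid) // drops every entry the segment owns at once
}

func main() {
	c := &twoLevelCache{items: map[string]map[string]string{}}
	c.put("seg-1", "term(city,nyc)", "postings-a")
	c.put("seg-1", "field(city)", "postings-b")
	c.purgeSegment("seg-1")
	fmt.Println(len(c.items)) // 0
}
```

As the same comment notes, eager purging was a safety measure rather than a correctness requirement: entries for a closed segment are keyed by its UUID and never looked up again, so the replacement can safely leave them to age out of the cache.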
-func (c *postingsListLRU) Remove( - segmentUUID uuid.UUID, - field string, - pattern string, - patternType PatternType, -) bool { - newKey := newKey(field, pattern, patternType) - uuidArray := segmentUUID.Array() - if uuidEntries, ok := c.items[uuidArray]; ok { - if ent, ok := uuidEntries[newKey]; ok { - c.removeElement(ent) - return true - } - } - - return false -} - -func (c *postingsListLRU) PurgeSegment(segmentUUID uuid.UUID) { - if uuidEntries, ok := c.items[segmentUUID.Array()]; ok { - for _, ent := range uuidEntries { - c.removeElement(ent) - } - } -} - -// Len returns the number of items in the cache. -func (c *postingsListLRU) Len() int { - return c.evictList.Len() -} - -// removeOldest removes the oldest item from the cache. -func (c *postingsListLRU) removeOldest() { - ent := c.evictList.Back() - if ent != nil { - c.removeElement(ent) - } -} - -// removeElement is used to remove a given list element from the cache -func (c *postingsListLRU) removeElement(e *list.Element) { - c.evictList.Remove(e) - entry := e.Value.(*entry) - - if patterns, ok := c.items[entry.uuid.Array()]; ok { - delete(patterns, entry.key) - if len(patterns) == 0 { - delete(c.items, entry.uuid.Array()) - } - } - - if c.onRemove != nil { - c.onRemove(entry.postingsList, entry.metadata) - } -} - -func newKey(field, pattern string, patternType PatternType) key { - return key{field: field, pattern: pattern, patternType: patternType} -} diff --git a/src/dbnode/storage/index/postings_list_cache_lru_test.go b/src/dbnode/storage/index/postings_list_cache_lru_test.go deleted file mode 100644 index 2fb384e767..0000000000 --- a/src/dbnode/storage/index/postings_list_cache_lru_test.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package index - -// Keys returns a slice of the keys in the cache, from oldest to newest. Used for -// testing only. 
-func (c *postingsListLRU) keys() []key { - keys := make([]key, 0, len(c.items)) - for ent := c.evictList.Back(); ent != nil; ent = ent.Prev() { - entry := ent.Value.(*entry) - keys = append(keys, entry.key) - } - return keys -} diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index ad5e0c5ae7..d3ba525ded 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -133,40 +133,9 @@ func (r *ReadThroughSegment) Close() error { r.closed = true - // Purge segments from the cache before closing the segment to avoid - // temporarily having postings lists in the cache whose underlying - // bytes are no longer mmap'd. - closer := cacheCloser{segment: r} - closer.closeCaches() return r.segment.Close() } -type cacheCloser struct { - segment *ReadThroughSegment - closed []*PostingsListCache -} - -func (c *cacheCloser) closeCaches() { - c.closeCache(c.segment.caches.SegmentPostingsListCache) - c.closeCache(c.segment.caches.SearchPostingsListCache) -} - -func (c *cacheCloser) closeCache(cache *PostingsListCache) { - if cache == nil { - return - } - for _, elem := range c.closed { - if elem == cache { - // Already closed. - break - } - } - // Close. - cache.PurgeSegment(c.segment.uuid) - // Add to list of unique closed caches. - c.closed = append(c.closed, cache) -} - // FieldsIterable is a pass through call to the segment, since there's no // postings lists to cache for queries. func (r *ReadThroughSegment) FieldsIterable() segment.FieldsIterable { diff --git a/src/x/context/context.go b/src/x/context/context.go index c6617e79e8..0214a8aa03 100644 --- a/src/x/context/context.go +++ b/src/x/context/context.go @@ -26,6 +26,7 @@ import ( xopentracing "github.com/m3db/m3/src/x/opentracing" xresource "github.com/m3db/m3/src/x/resource" + xsync "github.com/m3db/m3/src/x/sync" lightstep "github.com/lightstep/lightstep-tracer-go" "github.com/opentracing/opentracing-go" @@ -33,6 +34,8 @@ import ( "github.com/uber/jaeger-client-go" ) +const finalizeableListSlots = 16 + var ( noopTracer opentracing.NoopTracer ) @@ -42,15 +45,23 @@ var ( type ctx struct { sync.RWMutex - goCtx stdctx.Context - pool contextPool - done bool - wg sync.WaitGroup - finalizeables *finalizeableList + goCtx stdctx.Context + pool contextPool + done bool + wg sync.WaitGroup + + // Used fixed size allocation. 
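The finalizeableListSlots constant and the fixed-size slot array it sizes stripe a context's finalizeables by approximate CPU core, so concurrent RegisterFinalizer calls contend on per-slot locks instead of the context-wide mutex. A generic sketch of lock striping follows; it hashes a caller-supplied key where the patch uses the xsync.CPUCore helper, and the names are hypothetical:

```go
package main

import (
	"fmt"
	"hash/fnv"
	"sync"
)

const slots = 16

// stripedList spreads appends across per-slot locks to cut contention;
// readers that need everything walk all slots.
type stripedList struct {
	slot [slots]struct {
		mu    sync.Mutex
		items []string
	}
}

func (s *stripedList) add(key, item string) {
	h := fnv.New32a()
	_, _ = h.Write([]byte(key)) // stand-in for a CPU-core index
	i := int(h.Sum32()) % slots
	s.slot[i].mu.Lock()
	s.slot[i].items = append(s.slot[i].items, item)
	s.slot[i].mu.Unlock()
}

func (s *stripedList) len() int {
	n := 0
	for i := range s.slot {
		s.slot[i].mu.Lock()
		n += len(s.slot[i].items)
		s.slot[i].mu.Unlock()
	}
	return n
}

func main() {
	var l stripedList
	l.add("goroutine-a", "finalizer-1")
	l.add("goroutine-b", "finalizer-2")
	fmt.Println(l.len()) // 2
}
```

Striping means no slot ordering can be relied upon, so any full traversal has to walk every slot, which is exactly what the finalize and numFinalizeables loops in this patch do.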
+ finalizeables [16]finalizeableListSlot + parent Context checkedAndNotSampled bool } +type finalizeableListSlot struct { + lock sync.Mutex + list *finalizeableList +} + type finalizeable struct { finalizer xresource.Finalizer closer xresource.SimpleCloser @@ -116,46 +127,61 @@ func (c *ctx) RegisterCloser(f xresource.SimpleCloser) { c.registerFinalizeable(finalizeable{closer: f}) } +func slot() int { + return xsync.CPUCore() % finalizeableListSlots +} + func (c *ctx) registerFinalizeable(f finalizeable) { - if c.Lock(); c.done { - c.Unlock() + if c.RLock(); c.done { + c.RUnlock() return } - if c.finalizeables == nil { + idx := slot() + c.finalizeables[idx].lock.Lock() + if c.finalizeables[idx].list == nil { if c.pool != nil { - c.finalizeables = c.pool.getFinalizeablesList() + c.finalizeables[idx].list = c.pool.getFinalizeablesList() } else { - c.finalizeables = newFinalizeableList(nil) + c.finalizeables[idx].list = newFinalizeableList(nil) } } - c.finalizeables.PushBack(f) + c.finalizeables[idx].list.PushBack(f) + c.finalizeables[idx].lock.Unlock() - c.Unlock() + c.RUnlock() } func (c *ctx) numFinalizeables() int { - if c.finalizeables == nil { - return 0 + var n int + for idx := range c.finalizeables { + c.finalizeables[idx].lock.Lock() + if c.finalizeables[idx].list != nil { + n += c.finalizeables[idx].list.Len() + } + c.finalizeables[idx].lock.Unlock() } - return c.finalizeables.Len() + return n } func (c *ctx) DependsOn(blocker Context) { - parent := c.parentCtx() + c.RLock() + parent := c.parentCtxWithRLock() if parent != nil { + c.RUnlock() parent.DependsOn(blocker) return } - - c.Lock() - - if !c.done { + done := c.done + if !done { c.wg.Add(1) - blocker.RegisterFinalizer(c) } + c.RUnlock() - c.Unlock() + if !done { + // Register outside of RLock. + blocker.RegisterFinalizer(c) + } } // Finalize handles a call from another context that was depended upon closing. @@ -225,47 +251,47 @@ func (c *ctx) close(mode closeMode, returnMode returnToPoolMode) { c.Unlock() return } - c.done = true - - // Capture finalizeables to avoid concurrent r/w if Reset - // is used after a caller waits for the finalizers to finish - f := c.finalizeables - c.finalizeables = nil - c.Unlock() - if f == nil { - c.tryReturnToPool(returnMode) - return - } - switch mode { case closeAsync: - go c.finalize(f, returnMode) + go c.finalize(returnMode) case closeBlock: - c.finalize(f, returnMode) + c.finalize(returnMode) } } -func (c *ctx) finalize(f *finalizeableList, returnMode returnToPoolMode) { +func (c *ctx) finalize(returnMode returnToPoolMode) { // Wait for dependencies. c.wg.Wait() // Now call finalizers. - for elem := f.Front(); elem != nil; elem = elem.Next() { - if elem.Value.finalizer != nil { - elem.Value.finalizer.Finalize() + for idx := range c.finalizeables { + c.finalizeables[idx].lock.Lock() + f := c.finalizeables[idx].list + c.finalizeables[idx].list = nil + c.finalizeables[idx].lock.Unlock() + + if f == nil { + // Nothing to callback. + continue } - if elem.Value.closer != nil { - elem.Value.closer.Close() + + for elem := f.Front(); elem != nil; elem = elem.Next() { + if elem.Value.finalizer != nil { + elem.Value.finalizer.Finalize() + } + if elem.Value.closer != nil { + elem.Value.closer.Close() + } } - } - if c.pool != nil { - // NB(r): Always return finalizeables, only the - // context itself might want to be reused immediately. - c.pool.putFinalizeablesList(f) + if c.pool != nil { + // NB(r): Always return finalizeables, only the + // context itself might want to be reused immediately. 
+ c.pool.putFinalizeablesList(f) + } } c.tryReturnToPool(returnMode) @@ -279,7 +305,10 @@ func (c *ctx) Reset() { } c.Lock() - c.done, c.finalizeables, c.goCtx, c.checkedAndNotSampled = false, nil, nil, false + c.done, c.goCtx, c.checkedAndNotSampled = false, nil, false + for idx := range c.finalizeables { + c.finalizeables[idx] = finalizeableListSlot{} + } c.Unlock() } @@ -317,12 +346,16 @@ func (c *ctx) setParentCtx(parentCtx Context) { func (c *ctx) parentCtx() Context { c.RLock() - parent := c.parent + parent := c.parentCtxWithRLock() c.RUnlock() return parent } +func (c *ctx) parentCtxWithRLock() Context { + return c.parent +} + func (c *ctx) StartSampledTraceSpan(name string) (Context, opentracing.Span, bool) { goCtx, exists := c.GoContext() if !exists || c.checkedAndNotSampled { diff --git a/src/x/context/context_test.go b/src/x/context/context_test.go index b2afe6abfe..18cad9eba6 100644 --- a/src/x/context/context_test.go +++ b/src/x/context/context_test.go @@ -22,10 +22,12 @@ package context import ( stdctx "context" + "fmt" "sync" "sync/atomic" "testing" "time" + "unsafe" "github.com/opentracing/opentracing-go" "github.com/opentracing/opentracing-go/mocktracer" @@ -298,3 +300,7 @@ func TestGoContext(t *testing.T) { assert.False(t, exists) assert.Nil(t, returnCtx) } + +func TestUnsafe(t *testing.T) { + fmt.Printf("%d\n", unsafe.Sizeof(ctx{})) +} From 0af1add574e71800795bb2bb7e0d9cec8c1ba094 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 18 Nov 2020 03:03:59 -0500 Subject: [PATCH 038/106] Use const for array size --- src/x/context/context.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x/context/context.go b/src/x/context/context.go index 0214a8aa03..39c0c7af6a 100644 --- a/src/x/context/context.go +++ b/src/x/context/context.go @@ -51,7 +51,7 @@ type ctx struct { wg sync.WaitGroup // Used fixed size allocation. 
- finalizeables [16]finalizeableListSlot + finalizeables [finalizeableListSlots]finalizeableListSlot parent Context checkedAndNotSampled bool From a41e8934d53f9e6ec1a715de34de2521fc12dc9e Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 18 Nov 2020 03:06:59 -0500 Subject: [PATCH 039/106] Remove test unsafe --- src/x/context/context_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/x/context/context_test.go b/src/x/context/context_test.go index 18cad9eba6..b2afe6abfe 100644 --- a/src/x/context/context_test.go +++ b/src/x/context/context_test.go @@ -22,12 +22,10 @@ package context import ( stdctx "context" - "fmt" "sync" "sync/atomic" "testing" "time" - "unsafe" "github.com/opentracing/opentracing-go" "github.com/opentracing/opentracing-go/mocktracer" @@ -300,7 +298,3 @@ func TestGoContext(t *testing.T) { assert.False(t, exists) assert.Nil(t, returnCtx) } - -func TestUnsafe(t *testing.T) { - fmt.Printf("%d\n", unsafe.Sizeof(ctx{})) -} From ef5dd158222fb5e4117253f5a484fb4f4eade3f9 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 18 Nov 2020 15:45:04 -0500 Subject: [PATCH 040/106] Rename to slot index --- src/x/context/context.go | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/src/x/context/context.go b/src/x/context/context.go index 39c0c7af6a..28bec35766 100644 --- a/src/x/context/context.go +++ b/src/x/context/context.go @@ -127,39 +127,35 @@ func (c *ctx) RegisterCloser(f xresource.SimpleCloser) { c.registerFinalizeable(finalizeable{closer: f}) } -func slot() int { - return xsync.CPUCore() % finalizeableListSlots -} - func (c *ctx) registerFinalizeable(f finalizeable) { if c.RLock(); c.done { c.RUnlock() return } - idx := slot() - c.finalizeables[idx].lock.Lock() - if c.finalizeables[idx].list == nil { + slot := xsync.CPUCore() % finalizeableListSlots + c.finalizeables[slot].lock.Lock() + if c.finalizeables[slot].list == nil { if c.pool != nil { - c.finalizeables[idx].list = c.pool.getFinalizeablesList() + c.finalizeables[slot].list = c.pool.getFinalizeablesList() } else { - c.finalizeables[idx].list = newFinalizeableList(nil) + c.finalizeables[slot].list = newFinalizeableList(nil) } } - c.finalizeables[idx].list.PushBack(f) - c.finalizeables[idx].lock.Unlock() + c.finalizeables[slot].list.PushBack(f) + c.finalizeables[slot].lock.Unlock() c.RUnlock() } func (c *ctx) numFinalizeables() int { var n int - for idx := range c.finalizeables { - c.finalizeables[idx].lock.Lock() - if c.finalizeables[idx].list != nil { - n += c.finalizeables[idx].list.Len() + for slot := range c.finalizeables { + c.finalizeables[slot].lock.Lock() + if c.finalizeables[slot].list != nil { + n += c.finalizeables[slot].list.Len() } - c.finalizeables[idx].lock.Unlock() + c.finalizeables[slot].lock.Unlock() } return n } @@ -267,11 +263,11 @@ func (c *ctx) finalize(returnMode returnToPoolMode) { c.wg.Wait() // Now call finalizers. - for idx := range c.finalizeables { - c.finalizeables[idx].lock.Lock() - f := c.finalizeables[idx].list - c.finalizeables[idx].list = nil - c.finalizeables[idx].lock.Unlock() + for slot := range c.finalizeables { + c.finalizeables[slot].lock.Lock() + f := c.finalizeables[slot].list + c.finalizeables[slot].list = nil + c.finalizeables[slot].lock.Unlock() if f == nil { // Nothing to callback. 
From 7f50c75c9e9ffe22553ccca607baa7bef2627701 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 18 Nov 2020 18:16:53 -0500 Subject: [PATCH 041/106] Add more realtime stats to number of mutable segments --- src/dbnode/storage/index.go | 97 +++++++++++++------- src/dbnode/storage/index/block.go | 25 +++-- src/dbnode/storage/index/mutable_segments.go | 63 +++++++------ src/dbnode/storage/index/types.go | 17 +++- 4 files changed, 132 insertions(+), 70 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 00d22ab5a4..732be75d42 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -560,7 +560,11 @@ func (i *nsIndex) BlockStartForWriteTime(writeTime time.Time) xtime.UnixNano { } func (i *nsIndex) BlockForBlockStart(blockStart time.Time) (index.Block, error) { - return i.ensureBlockPresent(blockStart) + result, err := i.ensureBlockPresent(blockStart) + if err != nil { + return nil, err + } + return result.block, nil } // NB(prateek): including the call chains leading to this point: @@ -780,7 +784,7 @@ func (i *nsIndex) writeBatchForBlockStart( // block we release it so we don't block the tick, etc when we insert // batches since writing batches can take significant time when foreground // compaction occurs. - block, err := i.ensureBlockPresent(blockStart) + blockResult, err := i.ensureBlockPresent(blockStart) if err != nil { batch.MarkUnmarkedEntriesError(err) i.logger.Error("unable to write to index, dropping inserts", @@ -797,9 +801,9 @@ func (i *nsIndex) writeBatchForBlockStart( i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending)) // i.e. we have the block and the inserts, perform the writes. - result, err := block.WriteBatch(batch) + result, err := blockResult.block.WriteBatch(batch) - // record the end to end indexing latency + // Record the end to end indexing latency. now := i.nowFn() for idx := range pending { took := now.Sub(pending[idx].EnqueuedAt) @@ -812,6 +816,12 @@ func (i *nsIndex) writeBatchForBlockStart( i.metrics.asyncInsertSuccess.Inc(n) } + // Record mutable segments count foreground/background if latest block. + if stats := result.MutableSegmentsStats; !stats.Empty() && blockResult.latest { + i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.NumForeground)) + i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.NumBackground)) + } + // Allow for duplicate write errors since due to re-indexing races // we may try to re-index a series more than once. if err := i.sanitizeAllowDuplicatesWriteError(err); err != nil { @@ -848,12 +858,12 @@ func (i *nsIndex) Bootstrap( var multiErr xerrors.MultiError for blockStart, blockResults := range bootstrapResults { - block, err := i.ensureBlockPresentWithRLock(blockStart.ToTime()) + blockResult, err := i.ensureBlockPresentWithRLock(blockStart.ToTime()) if err != nil { // should never happen multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err)) continue } - if err := block.AddResults(blockResults); err != nil { + if err := blockResult.block.AddResults(blockResults); err != nil { multiErr = multiErr.Add(err) } } @@ -1040,13 +1050,13 @@ func (i *nsIndex) flushableBlocks( currentBlockStart := now.Truncate(i.blockSize) // Check for flushable blocks by iterating through all block starts w/in retention. 
for blockStart := earliestBlockStartToRetain; blockStart.Before(currentBlockStart); blockStart = blockStart.Add(i.blockSize) { - block, err := i.ensureBlockPresentWithRLock(blockStart) + blockResult, err := i.ensureBlockPresentWithRLock(blockStart) if err != nil { return nil, err } canFlush, err := i.canFlushBlockWithRLock(infoFiles, now, blockStart, - block, shards, flushType) + blockResult.block, shards, flushType) if err != nil { return nil, err } @@ -1054,7 +1064,7 @@ func (i *nsIndex) flushableBlocks( continue } - flushable = append(flushable, block) + flushable = append(flushable, blockResult.block) } return flushable, nil } @@ -1834,11 +1844,16 @@ func (i *nsIndex) overriddenOptsForQueryWithRLock( return opts } -func (i *nsIndex) ensureBlockPresent(blockStart time.Time) (index.Block, error) { +type blockPresentResult struct { + block index.Block + latest bool +} + +func (i *nsIndex) ensureBlockPresent(blockStart time.Time) (blockPresentResult, error) { i.state.RLock() defer i.state.RUnlock() if !i.isOpenWithRLock() { - return nil, errDbIndexUnableToWriteClosed + return blockPresentResult{}, errDbIndexUnableToWriteClosed } return i.ensureBlockPresentWithRLock(blockStart) } @@ -1846,19 +1861,22 @@ func (i *nsIndex) ensureBlockPresent(blockStart time.Time) (index.Block, error) // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified // blockStart, allocating one if it does not. It returns the desired block, or // error if it's unable to do so. -func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (index.Block, error) { +func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (blockPresentResult, error) { // check if the current latest block matches the required block, this // is the usual path and can short circuit the rest of the logic in this // function in most cases. if i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) { - return i.state.latestBlock, nil + return blockPresentResult{ + block: i.state.latestBlock, + latest: true, + }, nil } // check if exists in the map (this can happen if the latestBlock has not // been rotated yet). blockStartNanos := xtime.ToUnixNano(blockStart) if block, ok := i.state.blocksByTime[blockStartNanos]; ok { - return block, nil + return blockPresentResult{block: block}, nil } // i.e. block start does not exist, so we have to alloc. @@ -1876,21 +1894,21 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (index.Block // re-check if exists in the map (another routine did the alloc) if block, ok := i.state.blocksByTime[blockStartNanos]; ok { - return block, nil + return blockPresentResult{block: block}, nil } // ok now we know for sure we have to alloc block, err := i.newBlockFn(blockStart, i.nsMetadata, index.BlockOptions{}, i.namespaceRuntimeOptsMgr, i.opts.IndexOptions()) if err != nil { // unable to allocate the block, should never happen. - return nil, i.unableToAllocBlockInvariantError(err) + return blockPresentResult{}, i.unableToAllocBlockInvariantError(err) } // NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent. // We need to seal cold blocks write away for cold writes. 
if !blockStart.After(i.lastSealableBlockStart(i.nowFn())) { if err := block.Seal(); err != nil { - return nil, err + return blockPresentResult{}, err } } @@ -1899,7 +1917,10 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (index.Block // update ordered blockStarts slice, and latestBlock i.updateBlockStartsWithLock() - return block, nil + return blockPresentResult{ + block: block, + latest: i.state.latestBlock.StartTime().Equal(blockStart), + }, nil } func (i *nsIndex) lastSealableBlockStart(t time.Time) time.Time { @@ -2183,22 +2204,24 @@ type nsIndexMetrics struct { asyncInsertAttemptSkip tally.Counter asyncInsertAttemptWrite tally.Counter - asyncInsertSuccess tally.Counter - asyncInsertErrors tally.Counter - insertAfterClose tally.Counter - queryAfterClose tally.Counter - forwardIndexHits tally.Counter - forwardIndexMisses tally.Counter - forwardIndexCounter tally.Counter - insertEndToEndLatency tally.Timer - blocksEvictedMutableSegments tally.Counter - blockMetrics nsIndexBlocksMetrics - indexingConcurrencyMin tally.Gauge - indexingConcurrencyMax tally.Gauge - indexingConcurrencyAvg tally.Gauge - flushIndexingConcurrency tally.Gauge - flushDocsNew tally.Counter - flushDocsCached tally.Counter + asyncInsertSuccess tally.Counter + asyncInsertErrors tally.Counter + insertAfterClose tally.Counter + queryAfterClose tally.Counter + forwardIndexHits tally.Counter + forwardIndexMisses tally.Counter + forwardIndexCounter tally.Counter + insertEndToEndLatency tally.Timer + blocksEvictedMutableSegments tally.Counter + blockMetrics nsIndexBlocksMetrics + indexingConcurrencyMin tally.Gauge + indexingConcurrencyMax tally.Gauge + indexingConcurrencyAvg tally.Gauge + flushIndexingConcurrency tally.Gauge + flushDocsNew tally.Counter + flushDocsCached tally.Counter + latestBlockNumSegmentsForeground tally.Gauge + latestBlockNumSegmentsBackground tally.Gauge loadedDocsPerQuery tally.Histogram queryExhaustiveSuccess tally.Counter @@ -2271,6 +2294,12 @@ func newNamespaceIndexMetrics( flushDocsCached: scope.Tagged(map[string]string{ "status": "cached", }).Counter("flush-docs"), + latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{ + "segment_type": "foreground", + }).Gauge("latest-block-num-segments"), + latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{ + "segment_type": "background", + }).Gauge("latest-block-num-segments"), loadedDocsPerQuery: scope.Histogram( "loaded-docs-per-query", tally.MustMakeExponentialValueBuckets(10, 2, 16), diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index b6c3b8b03b..f3e1474c12 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -276,25 +276,32 @@ func (b *block) WriteBatch(inserts *WriteBatch) (WriteBatchResult, error) { b.RLock() if !b.writesAcceptedWithRLock() { b.RUnlock() - return b.writeBatchResult(inserts, b.writeBatchErrorInvalidState(b.state)) + return b.writeBatchResult(inserts, MutableSegmentsStats{}, + b.writeBatchErrorInvalidState(b.state)) } if b.state == blockStateSealed { coldBlock := b.coldMutableSegments[len(b.coldMutableSegments)-1] b.RUnlock() - return b.writeBatchResult(inserts, coldBlock.WriteBatch(inserts)) + _, err := coldBlock.WriteBatch(inserts) + // Don't pass stats back from insertion into a cold block, + // we only care about warm mutable segments stats. 
+ return b.writeBatchResult(inserts, MutableSegmentsStats{}, err) } b.RUnlock() - return b.writeBatchResult(inserts, b.mutableSegments.WriteBatch(inserts)) + stats, err := b.mutableSegments.WriteBatch(inserts) + return b.writeBatchResult(inserts, stats, err) } func (b *block) writeBatchResult( inserts *WriteBatch, + stats MutableSegmentsStats, err error, ) (WriteBatchResult, error) { if err == nil { inserts.MarkUnmarkedEntriesSuccess() return WriteBatchResult{ - NumSuccess: int64(inserts.Len()), + NumSuccess: int64(inserts.Len()), + MutableSegmentsStats: stats, }, nil } @@ -302,7 +309,10 @@ func (b *block) writeBatchResult( if !ok { // NB: marking all the inserts as failure, cause we don't know which ones failed. inserts.MarkUnmarkedEntriesError(err) - return WriteBatchResult{NumError: int64(inserts.Len())}, err + return WriteBatchResult{ + NumError: int64(inserts.Len()), + MutableSegmentsStats: stats, + }, err } numErr := len(partialErr.Errs()) @@ -314,8 +324,9 @@ func (b *block) writeBatchResult( // Mark all non-error inserts success, so we don't repeatedly index them. inserts.MarkUnmarkedEntriesSuccess() return WriteBatchResult{ - NumSuccess: int64(inserts.Len() - numErr), - NumError: int64(numErr), + NumSuccess: int64(inserts.Len() - numErr), + NumError: int64(numErr), + MutableSegmentsStats: stats, }, partialErr } diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index c8973cc1f4..ee11e9bdbe 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -141,15 +141,15 @@ func (m *mutableSegments) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptio builder.SetSortConcurrency(m.writeIndexingConcurrency) } -func (m *mutableSegments) WriteBatch(inserts *WriteBatch) error { +func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, error) { m.Lock() if m.state == mutableSegmentsStateClosed { - return errMutableSegmentsAlreadyClosed + return MutableSegmentsStats{}, errMutableSegmentsAlreadyClosed } if m.compact.compactingForeground { m.Unlock() - return errUnableToWriteBlockConcurrent + return MutableSegmentsStats{}, errUnableToWriteBlockConcurrent } // Lazily allocate the segment builder and compactors. @@ -157,7 +157,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) error { m.blockOpts, m.opts) if err != nil { m.Unlock() - return err + return MutableSegmentsStats{}, err } m.compact.compactingForeground = true @@ -178,19 +178,22 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) error { }) if len(builder.Docs()) == 0 { // No inserts, no need to compact. - return insertResultErr + return MutableSegmentsStats{}, insertResultErr } // We inserted some documents, need to compact immediately into a // foreground segment from the segment builder before we can serve reads // from an FST segment. - err = m.foregroundCompactWithBuilder(builder) + result, err := m.foregroundCompactWithBuilder(builder) if err != nil { - return err + return MutableSegmentsStats{}, err } // Return result from the original insertion since compaction was successful. 
- return insertResultErr + return MutableSegmentsStats{ + NumForeground: result.numForeground, + NumBackground: result.numBackground, + }, insertResultErr } func (m *mutableSegments) AddReaders(readers []segment.Reader) ([]segment.Reader, error) { @@ -539,7 +542,7 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( return append(result, newReadableSeg(compacted, m.opts)) } -func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.DocumentsBuilder) error { +func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.DocumentsBuilder) (compactResult, error) { // We inserted some documents, need to compact immediately into a // foreground segment. m.Lock() @@ -564,18 +567,18 @@ func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.Documents plan, err := compaction.NewPlan(segs, m.opts.ForegroundCompactionPlannerOptions()) if err != nil { - return err + return compactResult{}, err } // Check plan if len(plan.Tasks) == 0 { // Should always generate a task when a mutable builder is passed to planner - return errForegroundCompactorNoPlan + return compactResult{}, errForegroundCompactorNoPlan } if taskNumBuilders(plan.Tasks[0]) != 1 { // First task of plan must include the builder, so we can avoid resetting it // for the first task, but then safely reset it in consequent tasks - return errForegroundCompactorBadPlanFirstTask + return compactResult{}, errForegroundCompactorBadPlanFirstTask } // Move any unused segments to the background. @@ -609,11 +612,10 @@ func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.Documents defer sw.Stop() // Run the first task, without resetting the builder. - if err := m.foregroundCompactWithTask( - builder, plan.Tasks[0], - log, logger.With(zap.Int("task", 0)), - ); err != nil { - return err + result, err := m.foregroundCompactWithTask(builder, plan.Tasks[0], + log, logger.With(zap.Int("task", 0))) + if err != nil { + return result, err } // Now run each consequent task, resetting the builder each time since @@ -623,19 +625,18 @@ func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.Documents task := plan.Tasks[i] if taskNumBuilders(task) > 0 { // Only the first task should compact the builder - return errForegroundCompactorBadPlanSecondaryTask + return result, errForegroundCompactorBadPlanSecondaryTask } // Now use the builder after resetting it. 
builder.Reset() - if err := m.foregroundCompactWithTask( - builder, task, - log, logger.With(zap.Int("task", i)), - ); err != nil { - return err + result, err = m.foregroundCompactWithTask(builder, task, + log, logger.With(zap.Int("task", i))) + if err != nil { + return result, err } } - return nil + return result, nil } func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( @@ -679,12 +680,17 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( m.maybeBackgroundCompactWithLock() } +type compactResult struct { + numForeground int + numBackground int +} + func (m *mutableSegments) foregroundCompactWithTask( builder segment.DocumentsBuilder, task compaction.Task, log bool, logger *zap.Logger, -) error { +) (compactResult, error) { if log { logger.Debug("start compaction task") } @@ -712,7 +718,7 @@ func (m *mutableSegments) foregroundCompactWithTask( } if err != nil { - return err + return compactResult{}, err } // Add a read through cache for repeated expensive queries against @@ -728,7 +734,10 @@ func (m *mutableSegments) foregroundCompactWithTask( segments, segment) m.foregroundSegments = result - return nil + return compactResult{ + numForeground: len(m.foregroundSegments), + numBackground: len(m.backgroundSegments), + }, nil } func (m *mutableSegments) cleanupForegroundCompactWithLock() { diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 1957752f75..8ece3843d6 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -489,8 +489,21 @@ const ( // WriteBatchResult returns statistics about the WriteBatch execution. type WriteBatchResult struct { - NumSuccess int64 - NumError int64 + NumSuccess int64 + NumError int64 + MutableSegmentsStats MutableSegmentsStats +} + +// MutableSegmentsStats contains metadata about +// an insertion into mutable segments. +type MutableSegmentsStats struct { + NumForeground int + NumBackground int +} + +// Empty returns whether stats is empty or not. +func (s MutableSegmentsStats) Empty() bool { + return s.NumForeground == 0 && s.NumBackground == 0 } // BlockTickResult returns statistics about tick. 
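NB: The thread worth following in patch 041 is how MutableSegmentsStats rides an existing return value. WriteBatch already returned a WriteBatchResult, so the foreground/background segment counts travel from foregroundCompactWithTask up to writeBatchForBlockStart with no extra locking, and are published only when the write landed in the latest (still actively written) block. A condensed sketch of just that reporting guard follows, assuming the uber-go/tally API used in the diff; mutableSegmentsStats here is an illustrative stand-in for the exported type the patch adds.

```go
// A sketch of the tagged-gauge reporting guard, under the assumption that
// stats arrive with the batch result as in writeBatchForBlockStart.
package main

import (
	"fmt"

	"github.com/uber-go/tally"
)

type mutableSegmentsStats struct {
	numForeground int
	numBackground int
}

func (s mutableSegmentsStats) empty() bool {
	return s.numForeground == 0 && s.numBackground == 0
}

func main() {
	scope := tally.NewTestScope("index", nil)
	fg := scope.Tagged(map[string]string{"segment_type": "foreground"}).
		Gauge("latest-block-num-segments")
	bg := scope.Tagged(map[string]string{"segment_type": "background"}).
		Gauge("latest-block-num-segments")

	// Pretend a write batch just returned these stats for the latest block.
	stats, latest := mutableSegmentsStats{numForeground: 2, numBackground: 5}, true

	// Mirror the guard on the write path: only the latest block reports, so
	// writes into older blocks never overwrite the gauges.
	if !stats.empty() && latest {
		fg.Update(float64(stats.numForeground))
		bg.Update(float64(stats.numBackground))
	}
	fmt.Println("reported:", stats)
}
```

The patch also deliberately returns empty stats for cold-block writes, so latest-block-num-segments only ever describes the warm mutable segments of the block currently taking writes rather than flapping between warm and cold paths.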
From 8beb1a1a95842175bcfb6c9c7035921060057a9c Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 19 Nov 2020 01:09:29 -0500 Subject: [PATCH 042/106] Reduce contention for add documents and query cache key building --- src/dbnode/storage/index.go | 14 +- src/dbnode/storage/index/aggregate_results.go | 5 + src/dbnode/storage/index/block.go | 15 +- src/dbnode/storage/index/results.go | 134 ++++++++++++++++-- src/dbnode/storage/index/types.go | 17 ++- .../storage/index/wide_query_results.go | 5 + src/m3ninx/search/query/conjunction.go | 26 ++-- src/m3ninx/search/query/disjunction.go | 28 ++-- src/m3ninx/search/query/field.go | 25 ++-- src/m3ninx/search/query/negation.go | 29 ++-- src/m3ninx/search/query/regexp.go | 28 ++-- src/m3ninx/search/query/term.go | 30 ++-- 12 files changed, 229 insertions(+), 127 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 732be75d42..bfb5400e36 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -1858,6 +1858,10 @@ func (i *nsIndex) ensureBlockPresent(blockStart time.Time) (blockPresentResult, return i.ensureBlockPresentWithRLock(blockStart) } +func (i *nsIndex) isLatestBlockWithRLock(blockStart time.Time) bool { + return i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) +} + // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified // blockStart, allocating one if it does not. It returns the desired block, or // error if it's unable to do so. @@ -1865,7 +1869,7 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (blockPresen // check if the current latest block matches the required block, this // is the usual path and can short circuit the rest of the logic in this // function in most cases. - if i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart) { + if i.isLatestBlockWithRLock(blockStart) { return blockPresentResult{ block: i.state.latestBlock, latest: true, @@ -1894,7 +1898,10 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (blockPresen // re-check if exists in the map (another routine did the alloc) if block, ok := i.state.blocksByTime[blockStartNanos]; ok { - return blockPresentResult{block: block}, nil + return blockPresentResult{ + block: block, + latest: i.isLatestBlockWithRLock(blockStart), + }, nil } // ok now we know for sure we have to alloc @@ -1917,9 +1924,10 @@ func (i *nsIndex) ensureBlockPresentWithRLock(blockStart time.Time) (blockPresen // update ordered blockStarts slice, and latestBlock i.updateBlockStartsWithLock() + return blockPresentResult{ block: block, - latest: i.state.latestBlock.StartTime().Equal(blockStart), + latest: i.isLatestBlockWithRLock(blockStart), }, nil } diff --git a/src/dbnode/storage/index/aggregate_results.go b/src/dbnode/storage/index/aggregate_results.go index 01c1c3eadc..b3e15ea8d0 100644 --- a/src/dbnode/storage/index/aggregate_results.go +++ b/src/dbnode/storage/index/aggregate_results.go @@ -104,6 +104,11 @@ func (r *aggregatedResults) Reset( r.Unlock() } +func (r *aggregatedResults) NonConcurrentBuilder() (BaseResultsBuilder, bool) { + // Not supported. 
+ return nil, false +} + func (r *aggregatedResults) AddDocuments(batch []doc.Document) (int, int, error) { r.Lock() err := r.addDocumentsBatchWithLock(batch) diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index f3e1474c12..833e82c590 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -536,9 +536,16 @@ func (b *block) queryReadersNoLock( cancellable *xresource.CancellableLifetime, query Query, opts QueryOptions, - results BaseResults, + queryResults BaseResults, segmentReaders []m3ninxindex.Reader, ) (bool, error) { + // Use a non concurrent builder for query results if can. + results, ok := queryResults.NonConcurrentBuilder() + if !ok { + // Fall back to using the query results as builder. + results = queryResults + } + exec := executor.NewExecutor(segmentReaders) // Make sure if we don't register to close the executor later @@ -575,8 +582,8 @@ func (b *block) queryReadersNoLock( var ( iterCloser = safeCloser{closable: iter} - size = results.Size() - docsCount = results.TotalDocsCount() + size = queryResults.Size() + docsCount = queryResults.TotalDocsCount() docsPool = b.opts.DocumentArrayPool() batch = docsPool.Get() batchSize = cap(batch) @@ -636,7 +643,7 @@ func (b *block) closeAsyncNoLock(closer io.Closer) { func (b *block) addQueryResultsNoLock( cancellable *xresource.CancellableLifetime, - results BaseResults, + results BaseResultsBuilder, batch []doc.Document, ) ([]doc.Document, int, int, error) { // update recently queried docs to monitor memory. diff --git a/src/dbnode/storage/index/results.go b/src/dbnode/storage/index/results.go index b0c4372db3..a8b702143e 100644 --- a/src/dbnode/storage/index/results.go +++ b/src/dbnode/storage/index/results.go @@ -41,8 +41,13 @@ var ( type results struct { sync.RWMutex - nsID ident.ID - opts QueryResultsOptions + parent *results + + nsID ident.ID + opts QueryResultsOptions + indexOpts Options + + subResults []*results resultsMap *ResultsMap totalDocsCount int @@ -60,9 +65,18 @@ func NewQueryResults( opts QueryResultsOptions, indexOpts Options, ) QueryResults { + return newQueryResults(namespaceID, opts, indexOpts) +} + +func newQueryResults( + namespaceID ident.ID, + opts QueryResultsOptions, + indexOpts Options, +) *results { return &results{ nsID: namespaceID, opts: opts, + indexOpts: indexOpts, resultsMap: newResultsMap(indexOpts.IdentifierPool()), idPool: indexOpts.IdentifierPool(), bytesPool: indexOpts.CheckedBytesPool(), @@ -70,11 +84,27 @@ func NewQueryResults( } } -func (r *results) EnforceLimits() bool { return true } +func (r *results) EnforceLimits() bool { + return true +} func (r *results) Reset(nsID ident.ID, opts QueryResultsOptions) { + r.reset(nil, nsID, opts) +} + +func (r *results) reset(parent *results, nsID ident.ID, opts QueryResultsOptions) { r.Lock() + // Set parent. + r.parent = parent + + // Return all subresults to pools. + for i := range r.subResults { + r.subResults[i].Finalize() + r.subResults[i] = nil + } + r.subResults = r.subResults[:0] + // Finalize existing held nsID. 
if r.nsID != nil { r.nsID.Finalize() @@ -102,24 +132,61 @@ func (r *results) Reset(nsID ident.ID, opts QueryResultsOptions) { r.Unlock() } +func (r *results) NonConcurrentBuilder() (BaseResultsBuilder, bool) { + subResult := r.pool.Get().(*results) + subResult.reset(r, r.nsID, r.opts) + + r.Lock() + r.subResults = append(r.subResults, subResult) + r.Unlock() + + return subResult, true +} + // NB: If documents with duplicate IDs are added, they are simply ignored and // the first document added with an ID is returned. func (r *results) AddDocuments(batch []doc.Document) (int, int, error) { + var size, docsCount int + r.Lock() err := r.addDocumentsBatchWithLock(batch) - size := r.resultsMap.Len() - docsCount := r.totalDocsCount + len(batch) - r.totalDocsCount = docsCount + parent := r.parent + if parent == nil { + size, docsCount = r.statsWithRLock() + } r.Unlock() + + if parent == nil { + return size, docsCount, err + } + + // If a child, need to aggregate the size and docs count. + parent.RLock() + size, docsCount = parent.statsWithRLock() + parent.RUnlock() + return size, docsCount, err } +func (r *results) statsWithRLock() (size int, docsCount int) { + size = r.resultsMap.Len() + docsCount = r.totalDocsCount + for _, subResult := range r.subResults { + subResult.RLock() + size += subResult.resultsMap.Len() + docsCount += subResult.totalDocsCount + subResult.RUnlock() + } + return +} + func (r *results) addDocumentsBatchWithLock(batch []doc.Document) error { for i := range batch { _, size, err := r.addDocumentWithLock(batch[i]) if err != nil { return err } + r.totalDocsCount++ if r.opts.SizeLimit > 0 && size >= r.opts.SizeLimit { // Early return if limit enforced and we hit our limit. break @@ -162,25 +229,68 @@ func (r *results) Namespace() ident.ID { return v } +func (r *results) mergeSubResultWithLock(subResult *results) { + subResult.Lock() + defer subResult.Unlock() + + if r.resultsMap.Len() == 0 { + // Just swap ownership of this results map since this subresult + // has results and the current results does not. + currResultsMap := r.resultsMap + r.resultsMap = subResult.resultsMap + subResult.resultsMap = currResultsMap + return + } + + for _, elem := range subResult.resultsMap.Iter() { + key := elem.Key() + if r.resultsMap.Contains(key) { + // Already contained. + continue + } + // It is assumed that the document is valid for the lifetime of the + // index results. + r.resultsMap.SetUnsafe(key, elem.Value(), resultMapNoFinalizeOpts) + } + + // Reset all keys in the subresult map next, this will finalize the keys + // and make sure the values are not closed on next reset. + subResult.resultsMap.Reset() +} + func (r *results) Map() *ResultsMap { - r.RLock() + r.Lock() + + // Copy any subresults into final result. + for _, subResult := range r.subResults { + r.mergeSubResultWithLock(subResult) + } + + // Finalize and reset sub results now merged. 
+ for i := range r.subResults { + r.subResults[i].Finalize() + r.subResults[i] = nil + } + r.subResults = r.subResults[:0] + v := r.resultsMap - r.RUnlock() + + r.Unlock() return v } func (r *results) Size() int { r.RLock() - v := r.resultsMap.Len() + size, _ := r.statsWithRLock() r.RUnlock() - return v + return size } func (r *results) TotalDocsCount() int { r.RLock() - count := r.totalDocsCount + _, docsCount := r.statsWithRLock() r.RUnlock() - return count + return docsCount } func (r *results) Finalize() { diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 8ece3843d6..b8962d9068 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -146,6 +146,8 @@ type AggregateQueryResult struct { // synchronized when access to the results set is used as documented by the // methods. type BaseResults interface { + BaseResultsBuilder + // Namespace returns the namespace associated with the result. Namespace() ident.ID @@ -155,6 +157,17 @@ type BaseResults interface { // TotalDocsCount returns the total number of documents observed. TotalDocsCount() int + // NonConcurrentBuilder returns a builder that should not be used with + // concurrency, will return false as second parameter if not possible. + NonConcurrentBuilder() (BaseResultsBuilder, bool) + + // Finalize releases any resources held by the Results object, + // including returning it to a backing pool. + Finalize() +} + +// BaseResultsBuilder is a results builder. +type BaseResultsBuilder interface { // EnforceLimits returns whether this should enforce and increment limits. EnforceLimits() bool @@ -164,10 +177,6 @@ type BaseResults interface { // TODO(r): We will need to change this behavior once index fields are // mutable and the most recent need to shadow older entries. AddDocuments(batch []doc.Document) (size, docsCount int, err error) - - // Finalize releases any resources held by the Results object, - // including returning it to a backing pool. - Finalize() } // QueryResults is a collection of results for a query, it is synchronized diff --git a/src/dbnode/storage/index/wide_query_results.go b/src/dbnode/storage/index/wide_query_results.go index 92903b2643..854ac4c7a9 100644 --- a/src/dbnode/storage/index/wide_query_results.go +++ b/src/dbnode/storage/index/wide_query_results.go @@ -98,6 +98,11 @@ func (r *wideResults) EnforceLimits() bool { return false } +func (r *wideResults) NonConcurrentBuilder() (BaseResultsBuilder, bool) { + // Not supported. + return nil, false +} + func (r *wideResults) AddDocuments(batch []doc.Document) (int, int, error) { var size, totalDocsCount int r.RLock() diff --git a/src/m3ninx/search/query/conjunction.go b/src/m3ninx/search/query/conjunction.go index d56d32afda..33c896dd26 100644 --- a/src/m3ninx/search/query/conjunction.go +++ b/src/m3ninx/search/query/conjunction.go @@ -22,7 +22,6 @@ package query import ( "strings" - "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -31,8 +30,7 @@ import ( // ConjuctionQuery finds documents which match at least one of the given queries. 
type ConjuctionQuery struct { - sync.Mutex - strValue string + str string queries []search.Query negations []search.Query } @@ -62,10 +60,15 @@ func NewConjunctionQuery(queries []search.Query) search.Query { ns = ns[1:] } - return &ConjuctionQuery{ + q := &ConjuctionQuery{ queries: qs, negations: ns, } + // NB(r): Calculate string value up front so + // not allocated every time String() is called to determine + // the cache key. + q.str = q.string() + return q } // Searcher returns a searcher over the provided readers. @@ -154,17 +157,10 @@ func (q *ConjuctionQuery) ToProto() *querypb.Query { } func (q *ConjuctionQuery) String() string { - q.Lock() - str := q.stringWithLock() - q.Unlock() - return str + return q.str } -func (q *ConjuctionQuery) stringWithLock() string { - if q.strValue != "" { - return q.strValue - } - +func (q *ConjuctionQuery) string() string { var str strings.Builder str.WriteString("conjunction(") join(&str, q.queries) @@ -173,7 +169,5 @@ func (q *ConjuctionQuery) stringWithLock() string { joinNegation(&str, q.negations) } str.WriteRune(')') - - q.strValue = str.String() - return q.strValue + return str.String() } diff --git a/src/m3ninx/search/query/disjunction.go b/src/m3ninx/search/query/disjunction.go index fe5905841d..c0c6acd452 100644 --- a/src/m3ninx/search/query/disjunction.go +++ b/src/m3ninx/search/query/disjunction.go @@ -22,7 +22,6 @@ package query import ( "strings" - "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -31,9 +30,8 @@ import ( // DisjuctionQuery finds documents which match at least one of the given queries. type DisjuctionQuery struct { - sync.Mutex - strValue string - queries []search.Query + str string + queries []search.Query } // NewDisjunctionQuery constructs a new query which matches documents that match any @@ -50,9 +48,14 @@ func NewDisjunctionQuery(queries []search.Query) search.Query { qs = append(qs, query) } - return &DisjuctionQuery{ + q := &DisjuctionQuery{ queries: qs, } + // NB(r): Calculate string value up front so + // not allocated every time String() is called to determine + // the cache key. + q.str = q.string() + return q } // Searcher returns a searcher over the provided readers. @@ -115,22 +118,13 @@ func (q *DisjuctionQuery) ToProto() *querypb.Query { } func (q *DisjuctionQuery) String() string { - q.Lock() - str := q.stringWithLock() - q.Unlock() - return str + return q.str } -func (q *DisjuctionQuery) stringWithLock() string { - if q.strValue != "" { - return q.strValue - } - +func (q *DisjuctionQuery) string() string { var str strings.Builder str.WriteString("disjunction(") join(&str, q.queries) str.WriteRune(')') - - q.strValue = str.String() - return q.strValue + return str.String() } diff --git a/src/m3ninx/search/query/field.go b/src/m3ninx/search/query/field.go index a5da0fa671..958805cc53 100644 --- a/src/m3ninx/search/query/field.go +++ b/src/m3ninx/search/query/field.go @@ -23,7 +23,6 @@ package query import ( "bytes" "strings" - "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -32,16 +31,17 @@ import ( // FieldQuery finds document which have the given field exactly. type FieldQuery struct { - sync.Mutex - strValue string - field []byte + str string + field []byte } // NewFieldQuery constructs a new FieldQuery for the given field. 
func NewFieldQuery(field []byte) search.Query { - return &FieldQuery{ + q := &FieldQuery{ field: field, } + q.str = q.string() + return q } // Field returns the field []byte. @@ -81,22 +81,13 @@ func (q *FieldQuery) ToProto() *querypb.Query { } func (q *FieldQuery) String() string { - q.Lock() - str := q.stringWithLock() - q.Unlock() - return str + return q.str } -func (q *FieldQuery) stringWithLock() string { - if q.strValue != "" { - return q.strValue - } - +func (q *FieldQuery) string() string { var str strings.Builder str.WriteString("field(") str.Write(q.field) str.WriteRune(')') - - q.strValue = str.String() - return q.strValue + return str.String() } diff --git a/src/m3ninx/search/query/negation.go b/src/m3ninx/search/query/negation.go index 3dff751724..3e2b7389b5 100644 --- a/src/m3ninx/search/query/negation.go +++ b/src/m3ninx/search/query/negation.go @@ -22,7 +22,6 @@ package query import ( "strings" - "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -31,16 +30,17 @@ import ( // NegationQuery finds document which do not match a given query. type NegationQuery struct { - sync.Mutex - strValue string - query search.Query + str string + query search.Query } // NewNegationQuery constructs a new NegationQuery for the given query. -func NewNegationQuery(q search.Query) search.Query { - return &NegationQuery{ - query: q, +func NewNegationQuery(query search.Query) search.Query { + q := &NegationQuery{ + query: query, } + q.str = q.string() + return q } // Searcher returns a searcher over the provided readers. @@ -78,22 +78,13 @@ func (q *NegationQuery) ToProto() *querypb.Query { } func (q *NegationQuery) String() string { - q.Lock() - str := q.stringWithLock() - q.Unlock() - return str + return q.str } -func (q *NegationQuery) stringWithLock() string { - if q.strValue != "" { - return q.strValue - } - +func (q *NegationQuery) string() string { var str strings.Builder str.WriteString("negation(") str.WriteString(q.query.String()) str.WriteRune(')') - - q.strValue = str.String() - return q.strValue + return str.String() } diff --git a/src/m3ninx/search/query/regexp.go b/src/m3ninx/search/query/regexp.go index 6a8185c16f..6cf814de62 100644 --- a/src/m3ninx/search/query/regexp.go +++ b/src/m3ninx/search/query/regexp.go @@ -23,7 +23,6 @@ package query import ( "bytes" "strings" - "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/index" @@ -33,8 +32,7 @@ import ( // RegexpQuery finds documents which match the given regular expression. type RegexpQuery struct { - sync.Mutex - strValue string + str string field []byte regexp []byte compiled index.CompiledRegex @@ -47,11 +45,16 @@ func NewRegexpQuery(field, regexp []byte) (search.Query, error) { return nil, err } - return &RegexpQuery{ + q := &RegexpQuery{ field: field, regexp: regexp, compiled: compiled, - }, nil + } + // NB(r): Calculate string value up front so + // not allocated every time String() is called to determine + // the cache key. + q.str = q.string() + return q, nil } // MustCreateRegexpQuery is like NewRegexpQuery but panics if the query cannot be created. 
@@ -96,24 +99,15 @@ func (q *RegexpQuery) ToProto() *querypb.Query { } func (q *RegexpQuery) String() string { - q.Lock() - str := q.stringWithLock() - q.Unlock() - return str + return q.str } -func (q *RegexpQuery) stringWithLock() string { - if q.strValue != "" { - return q.strValue - } - +func (q *RegexpQuery) string() string { var str strings.Builder str.WriteString("regexp(") str.Write(q.field) str.WriteRune(',') str.Write(q.regexp) str.WriteRune(')') - - q.strValue = str.String() - return q.strValue + return str.String() } diff --git a/src/m3ninx/search/query/term.go b/src/m3ninx/search/query/term.go index 3488d2d0a6..b5d10cb63c 100644 --- a/src/m3ninx/search/query/term.go +++ b/src/m3ninx/search/query/term.go @@ -23,7 +23,6 @@ package query import ( "bytes" "strings" - "sync" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/search" @@ -32,18 +31,22 @@ import ( // TermQuery finds document which match the given term exactly. type TermQuery struct { - sync.Mutex - strValue string - field []byte - term []byte + str string + field []byte + term []byte } // NewTermQuery constructs a new TermQuery for the given field and term. func NewTermQuery(field, term []byte) search.Query { - return &TermQuery{ + q := &TermQuery{ field: field, term: term, } + // NB(r): Calculate string value up front so + // not allocated every time String() is called to determine + // the cache key. + q.str = q.string() + return q } // Searcher returns a searcher over the provided readers. @@ -79,24 +82,15 @@ func (q *TermQuery) ToProto() *querypb.Query { } func (q *TermQuery) String() string { - q.Lock() - str := q.stringWithLock() - q.Unlock() - return str + return q.str } -func (q *TermQuery) stringWithLock() string { - if q.strValue != "" { - return q.strValue - } - +func (q *TermQuery) string() string { var str strings.Builder str.WriteString("term(") str.Write(q.field) str.WriteRune(',') str.Write(q.term) str.WriteRune(')') - - q.strValue = str.String() - return q.strValue + return str.String() } From 60579697dc9b5099a6216b69916858edceff324b Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 19 Nov 2020 01:35:41 -0500 Subject: [PATCH 043/106] Only acquire lock during stats update/read --- src/dbnode/storage/index/results.go | 52 +++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/src/dbnode/storage/index/results.go b/src/dbnode/storage/index/results.go index a8b702143e..a334d95a2e 100644 --- a/src/dbnode/storage/index/results.go +++ b/src/dbnode/storage/index/results.go @@ -52,6 +52,10 @@ type results struct { resultsMap *ResultsMap totalDocsCount int + statsLock sync.RWMutex + statsSize int + statsDocsCount int + idPool ident.Pool bytesPool pool.CheckedBytesPool @@ -146,37 +150,47 @@ func (r *results) NonConcurrentBuilder() (BaseResultsBuilder, bool) { // NB: If documents with duplicate IDs are added, they are simply ignored and // the first document added with an ID is returned. func (r *results) AddDocuments(batch []doc.Document) (int, int, error) { - var size, docsCount int + if r.parent == nil { + // Locking only if parent, otherwise using non-concurrent safe builder. + r.Lock() + } - r.Lock() err := r.addDocumentsBatchWithLock(batch) parent := r.parent - if parent == nil { - size, docsCount = r.statsWithRLock() + size, docsCount := r.resultsMap.Len(), r.totalDocsCount + + if r.parent == nil { + // Locking only if parent, otherwise using non-concurrent safe builder. 
+ r.Unlock() } - r.Unlock() + + // Update stats using just the stats lock to avoid contention. + r.statsLock.Lock() + r.statsSize = size + r.statsDocsCount = docsCount + r.statsLock.Unlock() if parent == nil { return size, docsCount, err } // If a child, need to aggregate the size and docs count. - parent.RLock() - size, docsCount = parent.statsWithRLock() - parent.RUnlock() + size, docsCount = parent.statsNoLock() return size, docsCount, err } -func (r *results) statsWithRLock() (size int, docsCount int) { - size = r.resultsMap.Len() - docsCount = r.totalDocsCount +func (r *results) statsNoLock() (size int, docsCount int) { + r.statsLock.RLock() + size = r.statsSize + docsCount = r.statsDocsCount for _, subResult := range r.subResults { - subResult.RLock() - size += subResult.resultsMap.Len() - docsCount += subResult.totalDocsCount - subResult.RUnlock() + subResult.statsLock.RLock() + size += subResult.statsSize + docsCount += subResult.statsDocsCount + subResult.statsLock.RUnlock() } + r.statsLock.RUnlock() return } @@ -280,16 +294,12 @@ func (r *results) Map() *ResultsMap { } func (r *results) Size() int { - r.RLock() - size, _ := r.statsWithRLock() - r.RUnlock() + size, _ := r.statsNoLock() return size } func (r *results) TotalDocsCount() int { - r.RLock() - _, docsCount := r.statsWithRLock() - r.RUnlock() + _, docsCount := r.statsNoLock() return docsCount } From 4af3abac87606d516fe20f55785cf575f53c7bc6 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 19 Nov 2020 02:33:18 -0500 Subject: [PATCH 044/106] Fix race with statsNoLock --- src/dbnode/storage/index/results.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/dbnode/storage/index/results.go b/src/dbnode/storage/index/results.go index a334d95a2e..826d5f5a96 100644 --- a/src/dbnode/storage/index/results.go +++ b/src/dbnode/storage/index/results.go @@ -103,11 +103,10 @@ func (r *results) reset(parent *results, nsID ident.ID, opts QueryResultsOptions r.parent = parent // Return all subresults to pools. - for i := range r.subResults { - r.subResults[i].Finalize() - r.subResults[i] = nil + for _, subResult := range r.subResults { + subResult.Finalize() } - r.subResults = r.subResults[:0] + r.subResults = nil // Finalize existing held nsID. if r.nsID != nil { @@ -184,13 +183,20 @@ func (r *results) statsNoLock() (size int, docsCount int) { r.statsLock.RLock() size = r.statsSize docsCount = r.statsDocsCount - for _, subResult := range r.subResults { + r.statsLock.RUnlock() + + // Take snapshot of subresults with RLock. + r.RLock() + subResults := r.subResults[:] + r.RUnlock() + + for _, subResult := range subResults { subResult.statsLock.RLock() size += subResult.statsSize docsCount += subResult.statsDocsCount subResult.statsLock.RUnlock() } - r.statsLock.RUnlock() + return } @@ -281,11 +287,10 @@ func (r *results) Map() *ResultsMap { } // Finalize and reset sub results now merged. 
- for i := range r.subResults { - r.subResults[i].Finalize() - r.subResults[i] = nil + for _, subResult := range r.subResults { + subResult.Finalize() } - r.subResults = r.subResults[:0] + r.subResults = nil v := r.resultsMap From 178f02f75a82bc6b41b5a3618c6a89d3f8c096f7 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 19 Nov 2020 02:35:05 -0500 Subject: [PATCH 045/106] Always lock even if using non-concurrent builder --- src/dbnode/storage/index/results.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/dbnode/storage/index/results.go b/src/dbnode/storage/index/results.go index 826d5f5a96..0eacfac4ae 100644 --- a/src/dbnode/storage/index/results.go +++ b/src/dbnode/storage/index/results.go @@ -149,19 +149,11 @@ func (r *results) NonConcurrentBuilder() (BaseResultsBuilder, bool) { // NB: If documents with duplicate IDs are added, they are simply ignored and // the first document added with an ID is returned. func (r *results) AddDocuments(batch []doc.Document) (int, int, error) { - if r.parent == nil { - // Locking only if parent, otherwise using non-concurrent safe builder. - r.Lock() - } - + r.Lock() err := r.addDocumentsBatchWithLock(batch) parent := r.parent size, docsCount := r.resultsMap.Len(), r.totalDocsCount - - if r.parent == nil { - // Locking only if parent, otherwise using non-concurrent safe builder. - r.Unlock() - } + r.Unlock() // Update stats using just the stats lock to avoid contention. r.statsLock.Lock() From ccf9035b17c3bb46971a4e1ef923138e84561815 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 8 Dec 2020 18:46:41 -0500 Subject: [PATCH 046/106] Force cold writes enabled --- src/dbnode/storage/options.go | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dbnode/storage/options.go b/src/dbnode/storage/options.go index 8d9ab09c98..ac5b4b89c7 100644 --- a/src/dbnode/storage/options.go +++ b/src/dbnode/storage/options.go @@ -254,6 +254,7 @@ func newOptions(poolOpts pool.ObjectPoolOptions) Options { wideBatchSize: defaultWideBatchSize, namespaceHooks: &noopNamespaceHooks{}, tileAggregator: &noopTileAggregator{}, + forceColdWritesEnabled: true, } return o.SetEncodingM3TSZPooled() } From 0483b502d65d459c39c5b5eff7dfd1abe0627348 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 8 Dec 2020 18:46:41 -0500 Subject: [PATCH 047/106] Force cold writes enabled --- src/dbnode/server/server.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dbnode/server/server.go b/src/dbnode/server/server.go index fb90ab2e19..f0c8e12c27 100644 --- a/src/dbnode/server/server.go +++ b/src/dbnode/server/server.go @@ -656,7 +656,7 @@ func Run(runOpts RunOptions) { InstrumentOpts: iopts, HashingSeed: cfg.Hashing.Seed, NewDirectoryMode: newDirectoryMode, - ForceColdWritesEnabled: runOpts.StorageOptions.ForceColdWritesEnabled, + ForceColdWritesEnabled: true, }) if err != nil { logger.Fatal("could not initialize dynamic config", zap.Error(err)) @@ -667,7 +667,7 @@ func Run(runOpts RunOptions) { envCfgResults, err = envConfig.Configure(environment.ConfigurationParameters{ InstrumentOpts: iopts, HostID: hostID, - ForceColdWritesEnabled: runOpts.StorageOptions.ForceColdWritesEnabled, + ForceColdWritesEnabled: true, }) if err != nil { logger.Fatal("could not initialize static config", zap.Error(err)) From 2bbf6ba3236df2780b97b6bf39e01ed69ba69115 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 8 Dec 2020 18:52:26 -0500 Subject: [PATCH 048/106] Use fast IntersectsAny code path for aggregate queries --- 
.../storage/index/fields_terms_iterator.go | 64 ++++-- .../postings/roaring/bitmap_multi_readonly.go | 41 ---- .../roaring/bitmap_multi_readonly_test.go | 40 +++- .../postings/roaring/bitmap_readonly.go | 211 ++++++++++++++++++ 4 files changed, 292 insertions(+), 64 deletions(-) diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index e6d38f5a26..61e0a2647f 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -21,7 +21,9 @@ package index import ( + "bytes" "errors" + "fmt" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" @@ -76,9 +78,7 @@ type fieldsAndTermsIter struct { } restrictByPostingsBitmap *pilosaroaring.Bitmap - - restrictByPostings postings.List - restrictByPostingsIntersect *roaring.ReadOnlyBitmapIntersectCheck + restrictByPostings *roaring.ReadOnlyBitmap } var ( @@ -96,12 +96,16 @@ func newFieldsAndTermsIterator( reader segment.Reader, opts fieldsAndTermsIteratorOpts, ) (fieldsAndTermsIterator, error) { - var restrictByPostingsIntersect *roaring.ReadOnlyBitmapIntersectCheck + var restrictByPostings *roaring.ReadOnlyBitmap if index.MigrationReadOnlyPostings() { - restrictByPostingsIntersect = roaring.NewReadOnlyBitmapIntersectCheck() + var err error + restrictByPostings, err = roaring.NewReadOnlyBitmap(nil) + if err != nil { + return nil, err + } } iter := &fieldsAndTermsIter{ - restrictByPostingsIntersect: restrictByPostingsIntersect, + restrictByPostings: restrictByPostings, } err := iter.Reset(reader, opts) if err != nil { @@ -115,7 +119,7 @@ func (fti *fieldsAndTermsIter) Reset( opts fieldsAndTermsIteratorOpts, ) error { // Keep restrict by postings intersect check until completely closed. - restrictByPostingsIntersect := fti.restrictByPostingsIntersect + restrictByPostings := fti.restrictByPostings // Close per use items. if multiErr := fti.closePerUse(); multiErr.FinalError() != nil { @@ -126,7 +130,8 @@ func (fti *fieldsAndTermsIter) Reset( *fti = fieldsAndTermsIterZeroed // Restore restrict by postings intersect check. - fti.restrictByPostingsIntersect = restrictByPostingsIntersect + fti.restrictByPostings = restrictByPostings + fti.restrictByPostings.Reset(nil) // Set per use fields. fti.reader = reader @@ -159,7 +164,26 @@ func (fti *fieldsAndTermsIter) Reset( // Hold onto the postings bitmap to intersect against on a per term basis. if index.MigrationReadOnlyPostings() { - fti.restrictByPostings = pl + // Copy into a single flat read only bitmap so that can do fast intersect. + var buff bytes.Buffer + bitmap := pilosaroaring.NewBitmap() + iter := pl.Iterator() + for iter.Next() { + bitmap.DirectAdd(uint64(iter.Current())) + } + if _, err := bitmap.WriteTo(&buff); err != nil { + return err + } + if err := iter.Err(); err != nil { + return err + } + if err := iter.Close(); err != nil { + return err + } + + if err := fti.restrictByPostings.Reset(buff.Bytes()); err != nil { + return err + } } else { var ok bool fti.restrictByPostingsBitmap, ok = roaring.BitmapFromPostingsList(pl) @@ -187,12 +211,12 @@ func (fti *fieldsAndTermsIter) setNextField() bool { // Check term isn't part of at least some of the documents we're // restricted to providing results for based on intersection // count. 
- restrictBy := fti.restrictByPostings - match, err := fti.restrictByPostingsIntersect.Intersects(restrictBy, curr) - if err != nil { - fti.err = err + curr, ok := roaring.ReadOnlyBitmapFromPostingsList(curr) + if !ok { + fti.err = fmt.Errorf("next fields postings not read only bitmap") return false } + match := fti.restrictByPostings.IntersectsAny(curr) if !match { // No match. continue @@ -278,12 +302,11 @@ func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) { // Check term isn't part of at least some of the documents we're // restricted to providing results for based on intersection // count. - restrictBy := fti.restrictByPostings - curr := fti.current.postings - match, err := fti.restrictByPostingsIntersect.Intersects(restrictBy, curr) - if err != nil { - return false, err + curr, ok := roaring.ReadOnlyBitmapFromPostingsList(fti.current.postings) + if !ok { + return false, fmt.Errorf("next terms postings not read only bitmap") } + match := fti.restrictByPostings.IntersectsAny(curr) if match { // Matches, this is next result. return true, nil @@ -299,7 +322,7 @@ func (fti *fieldsAndTermsIter) nextTermsIterResult() (bool, error) { return false, errUnpackBitmapFromPostingsList } - // Check term isn part of at least some of the documents we're + // Check term isn't part of at least some of the documents we're // restricted to providing results for based on intersection // count. // Note: IntersectionCount is significantly faster than intersecting and @@ -354,9 +377,6 @@ func (fti *fieldsAndTermsIter) closePerUse() xerrors.MultiError { func (fti *fieldsAndTermsIter) Close() error { multiErr := fti.closePerUse() - if fti.restrictByPostingsIntersect != nil { - multiErr = multiErr.Add(fti.restrictByPostingsIntersect.Close()) - } multiErr = multiErr.Add(fti.Reset(nil, fieldsAndTermsIteratorOpts{})) return multiErr.FinalError() } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 4355801b6a..2f6103f76e 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -85,47 +85,6 @@ func IsReadOnlyPostingsList(pl postings.List) bool { return ok } -// ReadOnlyBitmapIntersectCheck is a check that can be repeated -// against read only bitmaps without allocations. -type ReadOnlyBitmapIntersectCheck struct { - multiBitmapIterator *multiBitmapIterator - intersect []readOnlyIterable -} - -// NewReadOnlyBitmapIntersectCheck creates a new bitmap intersect checker, -// it is zero allocation once allocated to compare two bitmaps. -func NewReadOnlyBitmapIntersectCheck() *ReadOnlyBitmapIntersectCheck { - return &ReadOnlyBitmapIntersectCheck{ - multiBitmapIterator: newMultiBitmapIterator(multiBitmapOptions{}), - intersect: make([]readOnlyIterable, 2), - } -} - -// Intersects returns whether two posting lists intersect or not. -func (c *ReadOnlyBitmapIntersectCheck) Intersects(a, b postings.List) (bool, error) { - if pl, ok := a.(readOnlyIterable); ok { - c.intersect[0] = pl - } else { - return false, ErrNotReadOnlyBitmap - } - if pl, ok := b.(readOnlyIterable); ok { - c.intersect[1] = pl - } else { - return false, ErrNotReadOnlyBitmap - } - - c.multiBitmapIterator.Reset(multiBitmapOptions{ - op: multiBitmapOpIntersect, - intersect: c.intersect, - }) - return c.multiBitmapIterator.Next(), nil -} - -// Close will close the intersect checker. 
-func (c *ReadOnlyBitmapIntersectCheck) Close() error { - return c.multiBitmapIterator.Close() -} - var _ postings.List = (*multiBitmap)(nil) var _ readOnlyIterable = (*multiBitmap)(nil) diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go index 970ad84337..f3e073abfb 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly_test.go @@ -147,17 +147,26 @@ func TestMultiBitmap(t *testing.T) { } for i := 0; i < each; i++ { t.Run(fmt.Sprintf("i=%d, test=+%v", i, test), func(t *testing.T) { + var bitmapsRW, bitmapsRO []postings.List + allReadOnly, err := NewReadOnlyRangePostingsList(0, uint64(test.insertRange)) require.NoError(t, err) reg, regReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, test.numRegular, genOpts) + bitmapsRW, bitmapsRO = append(bitmapsRW, reg...), append(bitmapsRO, regReadOnly...) + union, unionReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, test.numUnion, genOpts) + bitmapsRW, bitmapsRO = append(bitmapsRW, union...), append(bitmapsRO, unionReadOnly...) + negate, negateReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, test.numNegate, genOpts) + bitmapsRW, bitmapsRO = append(bitmapsRW, negate...), append(bitmapsRO, negateReadOnly...) + negateUnion, negateUnionReadOnly := genRandBitmapsAndReadOnlyBitmaps(t, test.numNegateUnion, genOpts) + bitmapsRW, bitmapsRO = append(bitmapsRW, negateUnion...), append(bitmapsRO, negateUnionReadOnly...) // First create the inner multi-bitmaps. multiInner := concat(regReadOnly) @@ -223,7 +232,7 @@ func TestMultiBitmap(t *testing.T) { equal := postings.Equal(multi, transformed) if !equal { fmt.Printf("negate: %v\n", postingsString(negate[0])) - msg := fmt.Sprintf("multi-bitmap: %s\nstandard: %s\n", + msg := fmt.Sprintf("multi-bitmap: %s, standard: %s", postingsString(multi), postingsString(transformed)) if debug := os.Getenv("TEST_DEBUG_DIR"); debug != "" { @@ -236,6 +245,29 @@ func TestMultiBitmap(t *testing.T) { require.True(t, equal, msg) } + // Check for IntersectsAny. + for i := 0; i < len(bitmapsRW); i++ { + for j := 0; j < len(bitmapsRW); j++ { + bi := bitmapFromPostings(t, bitmapsRW[i]) + bj := bitmapFromPostings(t, bitmapsRW[j]) + + expected := bi.IntersectionCount(bj) > 0 + + roi := bitmapReadOnlyFromPostings(t, bitmapsRO[i]) + roj := bitmapReadOnlyFromPostings(t, bitmapsRO[j]) + actual := roi.IntersectsAny(roj) + + equal := expected == actual + if !equal { + msg := fmt.Sprintf("expect: %v, actual: %v, left: %s, right: %s", + expected, actual, + postingsString(bitmapsRW[i]), postingsString(bitmapsRW[j])) + + require.Equal(t, equal, msg) + } + } + } + // Check for contains. 
// iter := transformed.Iterator() // for iter.Next() { @@ -278,6 +310,12 @@ func bitmapFromPostings(t *testing.T, pl postings.List) *roaring.Bitmap { return b } +func bitmapReadOnlyFromPostings(t *testing.T, pl postings.List) *ReadOnlyBitmap { + b, ok := ReadOnlyBitmapFromPostingsList(pl) + require.True(t, ok) + return b +} + func lists(list ...postings.List) []postings.List { return list } diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index 20d412b004..695bb586da 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -166,6 +166,66 @@ func (b bitmapReadOnlyContainer) contains(v uint16) bool { return (b.values[v/64] & (1 << uint64(v%64))) != 0 } +func (b bitmapReadOnlyContainer) containsAnyRange(start, end int32) bool { + i, j := start/64, end/64 + + // Same uint64. + if i == j { + offi, offj := uint(start%64), uint(64-end%64) + return popcount((b.values[i]>>offi)<<(offj+offi)) > 0 + } + + // At start. + if off := uint(start) % 64; off != 0 { + if popcount(b.values[i]>>off) > 0 { + return true + } + } + + // Count uint64 in between. + for ; i < j; i++ { + if popcount(b.values[i]) > 0 { + return true + } + } + + // Count partial ending uint64. + if j < int32(len(b.values)) { + off := 64 - (uint(end) % 64) + if popcount(b.values[j]< 0 { + return true + } + } + + return false +} + +func popcount(x uint64) uint64 { + return uint64(bits.OnesCount64(x)) +} + +func (b bitmapReadOnlyContainer) intersectsAnyBitmap(other bitmapReadOnlyContainer) bool { + var ( + ab = b.values[:bitmapN] + bb = other.values[:bitmapN] + ) + for i := 0; i < bitmapN; i += 4 { + if ab[i]&bb[i] != 0 { + return true + } + if ab[i+1]&bb[i+1] != 0 { + return true + } + if ab[i+2]&bb[i+2] != 0 { + return true + } + if ab[i+3]&bb[i+3] != 0 { + return true + } + } + return false +} + type arrayReadOnlyContainer struct { values []uint16 } @@ -178,6 +238,39 @@ func (a arrayReadOnlyContainer) contains(v uint16) bool { return idx < n && a.values[idx] == v } +func (a arrayReadOnlyContainer) intersectsAnyArray(other arrayReadOnlyContainer) bool { + for i, j := 0, 0; i < len(a.values) && j < len(other.values); { + if a.values[i] < a.values[j] { + i++ + continue + } + if a.values[j] < a.values[i] { + j++ + continue + } + return true + } + return false +} + +func (a arrayReadOnlyContainer) intersectsAnyBitmap(other bitmapReadOnlyContainer) bool { + for _, value := range a.values { + if other.contains(value) { + return true + } + } + return false +} + +func (a arrayReadOnlyContainer) intersectsAnyRuns(other runReadOnlyContainer) bool { + for _, value := range a.values { + if other.contains(value) { + return true + } + } + return false +} + type runReadOnlyContainer struct { values []interval16 } @@ -190,6 +283,35 @@ func (r runReadOnlyContainer) contains(v uint16) bool { return idx < n && v >= r.values[idx].start && v <= r.values[idx].last } +func (r runReadOnlyContainer) intersectsAnyRuns(other runReadOnlyContainer) bool { + for i, j := 0, 0; i < len(r.values) && j < len(other.values); { + va, vb := r.values[i], other.values[j] + if va.last < vb.start { + i++ + } else if va.start > vb.last { + j++ + } else if va.last > vb.last && va.start >= vb.start { + return true + } else if va.last > vb.last && va.start < vb.start { + return true + } else if va.last <= vb.last && va.start >= vb.start { + return true + } else if va.last <= vb.last && va.start < vb.start { + return true + } + } + return false +} + +func (r 
runReadOnlyContainer) intersectsAnyBitmap(other bitmapReadOnlyContainer) bool { + for _, value := range r.values { + if other.containsAnyRange(int32(value.start), int32(value.last)+1) { + return true + } + } + return false +} + func (c readOnlyContainer) validate() error { switch c.containerType { case containerBitmap: @@ -375,6 +497,95 @@ func (b *ReadOnlyBitmap) Equal(other postings.List) bool { return postings.Equal(b, other) } +// IntersectsAny checks whether other bitmap intersects any values in this one. +func (b *ReadOnlyBitmap) IntersectsAny(other *ReadOnlyBitmap) bool { + if b.keyN < 1 || other.keyN < 1 { + return false + } + for i, j := uint64(0), uint64(0); i < b.keyN && j < other.keyN; { + ki, kj := b.keyAtIndex(int(i)), other.keyAtIndex(int(j)) + if ki < kj { + i++ + continue + } + if kj < ki { + j++ + continue + } + + // Same key. + ci, _ := b.containerAtIndex(i) + cj, _ := other.containerAtIndex(j) + switch ci.containerType { + case containerArray: + left, _ := ci.array() + + switch cj.containerType { + case containerArray: + right, _ := cj.array() + if left.intersectsAnyArray(right) { + return true + } + case containerBitmap: + right, _ := cj.bitmap() + if left.intersectsAnyBitmap(right) { + return true + } + case containerRun: + right, _ := cj.runs() + if left.intersectsAnyRuns(right) { + return true + } + } + case containerBitmap: + left, _ := ci.bitmap() + + switch cj.containerType { + case containerArray: + right, _ := cj.array() + if right.intersectsAnyBitmap(left) { + return true + } + case containerBitmap: + right, _ := cj.bitmap() + if left.intersectsAnyBitmap(right) { + return true + } + case containerRun: + right, _ := cj.runs() + if right.intersectsAnyBitmap(left) { + return true + } + } + case containerRun: + left, _ := ci.runs() + + switch cj.containerType { + case containerArray: + right, _ := cj.array() + if right.intersectsAnyRuns(left) { + return true + } + case containerBitmap: + right, _ := cj.bitmap() + if left.intersectsAnyBitmap(right) { + return true + } + case containerRun: + right, _ := cj.runs() + if left.intersectsAnyRuns(right) { + return true + } + } + } + + i++ + j++ + } + + return false +} + func (b *ReadOnlyBitmap) keyAtIndex(index int) uint64 { meta := b.data[int(headerBaseSize)+index*12:] return binary.LittleEndian.Uint64(meta[0:8]) From 4dbe5936b42ad1b5631a4ee1031a2ae0c13cf821 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 8 Dec 2020 18:54:16 -0500 Subject: [PATCH 049/106] Remove field not present for storage options --- src/dbnode/storage/options.go | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dbnode/storage/options.go b/src/dbnode/storage/options.go index ac5b4b89c7..8d9ab09c98 100644 --- a/src/dbnode/storage/options.go +++ b/src/dbnode/storage/options.go @@ -254,7 +254,6 @@ func newOptions(poolOpts pool.ObjectPoolOptions) Options { wideBatchSize: defaultWideBatchSize, namespaceHooks: &noopNamespaceHooks{}, tileAggregator: &noopTileAggregator{}, - forceColdWritesEnabled: true, } return o.SetEncodingM3TSZPooled() } From 4cb402943a8b52e4e2123e017bcd630781c71ee9 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 8 Dec 2020 19:37:20 -0500 Subject: [PATCH 050/106] Fix non readonly postings --- .../storage/index/fields_terms_iterator.go | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index 61e0a2647f..6f12c31581 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ 
b/src/dbnode/storage/index/fields_terms_iterator.go @@ -131,7 +131,9 @@ func (fti *fieldsAndTermsIter) Reset( // Restore restrict by postings intersect check. fti.restrictByPostings = restrictByPostings - fti.restrictByPostings.Reset(nil) + if index.MigrationReadOnlyPostings() { + fti.restrictByPostings.Reset(nil) + } // Set per use fields. fti.reader = reader @@ -165,21 +167,30 @@ func (fti *fieldsAndTermsIter) Reset( // Hold onto the postings bitmap to intersect against on a per term basis. if index.MigrationReadOnlyPostings() { // Copy into a single flat read only bitmap so that can do fast intersect. - var buff bytes.Buffer - bitmap := pilosaroaring.NewBitmap() - iter := pl.Iterator() - for iter.Next() { - bitmap.DirectAdd(uint64(iter.Current())) + var ( + bitmap *pilosaroaring.Bitmap + buff bytes.Buffer + ) + if b, ok := roaring.BitmapFromPostingsList(pl); ok { + bitmap = b + } else { + bitmap = pilosaroaring.NewBitmap() + + iter := pl.Iterator() + for iter.Next() { + bitmap.DirectAdd(uint64(iter.Current())) + } + if err := iter.Err(); err != nil { + return err + } + if err := iter.Close(); err != nil { + return err + } } + if _, err := bitmap.WriteTo(&buff); err != nil { return err } - if err := iter.Err(); err != nil { - return err - } - if err := iter.Close(); err != nil { - return err - } if err := fti.restrictByPostings.Reset(buff.Bytes()); err != nil { return err From 0b4db71276ec70f9a836e4a4468237c8b00e9190 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 8 Dec 2020 20:36:17 -0500 Subject: [PATCH 051/106] Fix non readonly postings --- src/dbnode/storage/index/fields_terms_iterator.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index 6f12c31581..e7717132e8 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -118,9 +118,6 @@ func (fti *fieldsAndTermsIter) Reset( reader segment.Reader, opts fieldsAndTermsIteratorOpts, ) error { - // Keep restrict by postings intersect check until completely closed. - restrictByPostings := fti.restrictByPostings - // Close per use items. if multiErr := fti.closePerUse(); multiErr.FinalError() != nil { return multiErr.FinalError() @@ -129,12 +126,6 @@ func (fti *fieldsAndTermsIter) Reset( // Zero state. *fti = fieldsAndTermsIterZeroed - // Restore restrict by postings intersect check. - fti.restrictByPostings = restrictByPostings - if index.MigrationReadOnlyPostings() { - fti.restrictByPostings.Reset(nil) - } - // Set per use fields. 
fti.reader = reader fti.opts = opts @@ -192,7 +183,8 @@ func (fti *fieldsAndTermsIter) Reset( return err } - if err := fti.restrictByPostings.Reset(buff.Bytes()); err != nil { + fti.restrictByPostings, err = roaring.NewReadOnlyBitmap(buff.Bytes()) + if err != nil { return err } } else { From f668a8427d8159f12c048384a9d1e1f712eb75e0 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 9 Dec 2020 13:29:49 -0500 Subject: [PATCH 052/106] Fix intersect array --- src/dbnode/storage/index/fields_terms_iterator.go | 12 +----------- src/m3ninx/postings/roaring/bitmap_readonly.go | 4 ++-- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/src/dbnode/storage/index/fields_terms_iterator.go b/src/dbnode/storage/index/fields_terms_iterator.go index e7717132e8..af262c5b66 100644 --- a/src/dbnode/storage/index/fields_terms_iterator.go +++ b/src/dbnode/storage/index/fields_terms_iterator.go @@ -96,17 +96,7 @@ func newFieldsAndTermsIterator( reader segment.Reader, opts fieldsAndTermsIteratorOpts, ) (fieldsAndTermsIterator, error) { - var restrictByPostings *roaring.ReadOnlyBitmap - if index.MigrationReadOnlyPostings() { - var err error - restrictByPostings, err = roaring.NewReadOnlyBitmap(nil) - if err != nil { - return nil, err - } - } - iter := &fieldsAndTermsIter{ - restrictByPostings: restrictByPostings, - } + iter := &fieldsAndTermsIter{} err := iter.Reset(reader, opts) if err != nil { return nil, err diff --git a/src/m3ninx/postings/roaring/bitmap_readonly.go b/src/m3ninx/postings/roaring/bitmap_readonly.go index 695bb586da..dc19ce829c 100644 --- a/src/m3ninx/postings/roaring/bitmap_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_readonly.go @@ -240,11 +240,11 @@ func (a arrayReadOnlyContainer) contains(v uint16) bool { func (a arrayReadOnlyContainer) intersectsAnyArray(other arrayReadOnlyContainer) bool { for i, j := 0, 0; i < len(a.values) && j < len(other.values); { - if a.values[i] < a.values[j] { + if a.values[i] < other.values[j] { i++ continue } - if a.values[j] < a.values[i] { + if other.values[j] < a.values[i] { j++ continue } From a8eaf9ad7610f3354eb8dabfe7a66d6a743592a2 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 14 Jan 2021 17:47:58 -0500 Subject: [PATCH 053/106] Add in-memory block index block to replace per time window block, phase series out after index blocks sealed --- src/dbnode/storage/index.go | 56 +++- src/dbnode/storage/index/block.go | 19 ++ .../storage/index/compaction/compactor.go | 3 + src/dbnode/storage/index/mutable_segments.go | 260 ++++++++++++++++-- src/dbnode/storage/index/types.go | 7 + .../segment/builder/multi_segments_builder.go | 37 ++- ...i_segments_multi_key_postings_list_iter.go | 20 +- .../builder/multi_segments_terms_iter.go | 18 +- src/m3ninx/index/segment/types.go | 5 + 9 files changed, 376 insertions(+), 49 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index bfb5400e36..26601e85d9 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -139,6 +139,8 @@ type nsIndex struct { doNotIndexWithFields []doc.Field shardSet sharding.ShardSet + + inMemoryBlock index.Block } type nsIndexState struct { @@ -408,6 +410,15 @@ func newNamespaceIndexWithOptions( return nil, err } + futureBlock := nowFn().Add(10 * 365 * 24 * time.Hour) + inMemBlock, err := idx.newBlockFn(futureBlock, idx.nsMetadata, + index.BlockOptions{InMemoryBlock: true}, idx.namespaceRuntimeOptsMgr, idx.opts.IndexOptions()) + if err != nil { + return nil, err + } + + idx.inMemoryBlock = inMemBlock + // Report 
stats go idx.reportStatsUntilClosed() @@ -796,12 +807,19 @@ func (i *nsIndex) writeBatchForBlockStart( return } + block := blockResult.block + latest := blockResult.latest + if block.IsOpen() { + // Write to in memory block if this block is open. + block = i.inMemoryBlock + } + // Track attempted write. // Note: attemptTotal should = attemptSkip + attemptWrite. i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending)) // i.e. we have the block and the inserts, perform the writes. - result, err := blockResult.block.WriteBatch(batch) + result, err := block.WriteBatch(batch) // Record the end to end indexing latency. now := i.nowFn() @@ -817,7 +835,7 @@ func (i *nsIndex) writeBatchForBlockStart( } // Record mutable segments count foreground/background if latest block. - if stats := result.MutableSegmentsStats; !stats.Empty() && blockResult.latest { + if stats := result.MutableSegmentsStats; !stats.Empty() && latest { i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.NumForeground)) i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.NumBackground)) } @@ -891,7 +909,10 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceInd result.NumBlocks = int64(len(i.state.blocksByTime)) - var multiErr xerrors.MultiError + var ( + multiErr xerrors.MultiError + sealedBlocks = make([]xtime.UnixNano, 0, len(i.state.blocksByTime)) + ) for blockStart, block := range i.state.blocksByTime { if c.IsCancelled() { multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation) @@ -921,8 +942,24 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceInd multiErr = multiErr.Add(block.Seal()) result.NumBlocksSealed++ } + + if block.IsSealed() { + sealedBlocks = append(sealedBlocks, blockStart) + } } + block := i.inMemoryBlock + blockTickResult, tickErr := block.Tick(c) + multiErr = multiErr.Add(tickErr) + result.NumSegments += blockTickResult.NumSegments + result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped + result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable + result.NumTotalDocs += blockTickResult.NumDocs + result.FreeMmap += blockTickResult.FreeMmap + + // Notify in memory block of sealed blocks. + multiErr = multiErr.Add(block.InMemoryBlockNotifySealedBlocks(sealedBlocks)) + return result, multiErr.FinalError() } @@ -1942,9 +1979,10 @@ func (i *nsIndex) updateBlockStartsWithLock() { latestBlock index.Block ) - blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)) + blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)+1) for ts, block := range i.state.blocksByTime { if ts >= latestBlockStart { + latestBlockStart = ts latestBlock = block } blocks = append(blocks, blockAndBlockStart{ @@ -1953,10 +1991,16 @@ func (i *nsIndex) updateBlockStartsWithLock() { }) } + blocks = append(blocks, blockAndBlockStart{ + block: i.inMemoryBlock, + blockStart: xtime.ToUnixNano(i.inMemoryBlock.StartTime()), + }) + // order in desc order (i.e. reverse chronological) sort.Slice(blocks, func(i, j int) bool { return blocks[i].blockStart > blocks[j].blockStart }) + // NB(r): Important not to modify this once set since we take reference // to this slice with an RLock, release with RUnlock and then loop over it // during query time so it must not be altered and stay immutable. 
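
A note on the design above: the in-memory block is constructed with a start time roughly ten years in the future, so by block-start ordering it always sorts as the most recent block and is never expired by retention, while the tick feeds it the block starts that were just sealed so it can drop per-window state for those windows. What follows is a minimal toy sketch of that seal-notification contract; the simplified types are invented for illustration and are not the patch's implementation.

package main

import (
	"fmt"
	"time"
)

// activeBlock stands in for the patch's in-memory block: it keeps
// per-window bookkeeping and drops it once a window's block is sealed.
type activeBlock struct {
	perWindow map[int64]struct{} // keyed by block start (UnixNano)
}

func (b *activeBlock) notifySealedBlocks(sealed []int64) {
	for _, start := range sealed {
		delete(b.perWindow, start) // sealed windows take no more writes
	}
}

func main() {
	blockSize := 2 * time.Hour
	now := time.Now().Truncate(blockSize)
	old := now.Add(-2 * blockSize)
	b := &activeBlock{perWindow: map[int64]struct{}{
		old.UnixNano(): {},
		now.UnixNano(): {},
	}}
	// A tick seals the old window and notifies the active block.
	b.notifySealedBlocks([]int64{old.UnixNano()})
	fmt.Println(len(b.perWindow)) // 1: only the live window remains
}
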
@@ -2069,7 +2113,7 @@ func (i *nsIndex) CleanupDuplicateFileSets() error { func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error { i.state.RLock() - defer i.state.RLock() + defer i.state.RUnlock() if i.state.closed { return errDbIndexAlreadyClosed } @@ -2174,6 +2218,8 @@ func (i *nsIndex) Close() error { multiErr = multiErr.Add(block.Close()) } + multiErr = multiErr.Add(i.inMemoryBlock.Close()) + return multiErr.FinalError() } diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 833e82c590..8f83944b63 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -195,6 +195,7 @@ type blockShardRangesSegments struct { type BlockOptions struct { ForegroundCompactorMmapDocsData bool BackgroundCompactorMmapDocsData bool + InMemoryBlock bool } // NewBlockFn is a new block constructor. @@ -223,6 +224,7 @@ func NewBlock( scope := iopts.MetricsScope().SubScope("index").SubScope("block") iopts = iopts.SetMetricsScope(scope) segs := newMutableSegments( + md, blockStart, opts, blockOpts, @@ -233,6 +235,7 @@ func NewBlock( // NB(bodu): The length of coldMutableSegments is always at least 1. coldSegs := []*mutableSegments{ newMutableSegments( + md, blockStart, opts, blockOpts, @@ -264,6 +267,15 @@ func NewBlock( return b, nil } +func (b *block) InMemoryBlockNotifySealedBlocks( + sealed []xtime.UnixNano, +) error { + if !b.blockOpts.InMemoryBlock { + return fmt.Errorf("block not in-memory block: start=%v", b.StartTime()) + } + return b.mutableSegments.NotifySealedBlocks(sealed) +} + func (b *block) StartTime() time.Time { return b.blockStart } @@ -1120,6 +1132,12 @@ func (b *block) Stats(reporter BlockStatsReporter) error { return nil } +func (b *block) IsOpen() bool { + b.RLock() + defer b.RUnlock() + return b.state == blockStateOpen +} + func (b *block) IsSealedWithRLock() bool { return b.state == blockStateSealed } @@ -1212,6 +1230,7 @@ func (b *block) RotateColdMutableSegments() { b.Lock() defer b.Unlock() b.coldMutableSegments = append(b.coldMutableSegments, newMutableSegments( + b.nsMD, b.blockStart, b.opts, b.blockOpts, diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index 56408ee4df..e70e1cede8 100644 --- a/src/dbnode/storage/index/compaction/compactor.go +++ b/src/dbnode/storage/index/compaction/compactor.go @@ -23,6 +23,7 @@ package compaction import ( "bytes" "errors" + "github.com/m3db/bloom/v4" "io" "sync" @@ -105,6 +106,7 @@ func NewCompactor( // time. 
func (c *Compactor) Compact( segs []segment.Segment, + filter *bloom.ReadOnlyBloomFilter, reporterOptions mmap.ReporterOptions, ) (fst.Segment, error) { c.Lock() @@ -115,6 +117,7 @@ func (c *Compactor) Compact( } c.builder.Reset() + c.builder.SetFilter(filter) if err := c.builder.AddSegments(segs); err != nil { return nil, err } diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index ee11e9bdbe..bf38515a86 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -21,6 +21,7 @@ package index import ( + "bytes" "errors" "fmt" "math" @@ -28,6 +29,7 @@ import ( "sync" "time" + "github.com/m3db/bloom/v4" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/storage/index/compaction" "github.com/m3db/m3/src/dbnode/storage/index/segments" @@ -36,9 +38,11 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment/builder" "github.com/m3db/m3/src/m3ninx/index/segment/fst" "github.com/m3db/m3/src/x/context" + xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" xresource "github.com/m3db/m3/src/x/resource" + xtime "github.com/m3db/m3/src/x/time" "github.com/uber-go/tally" "go.uber.org/zap" @@ -65,21 +69,101 @@ type mutableSegments struct { state mutableSegmentsState - foregroundSegments []*readableSeg - backgroundSegments []*readableSeg + foregroundSegments []*readableSeg + backgroundSegments []*readableSeg + backgroundCompactIndexedSnapshot *indexedBloomFilterSnapshot compact mutableSegmentsCompact blockStart time.Time + blockSize time.Duration blockOpts BlockOptions opts Options iopts instrument.Options optsListener xresource.SimpleCloser writeIndexingConcurrency int + indexedBloomFilterByTimeLock sync.RWMutex + indexedBloomFilterByTime map[xtime.UnixNano]*indexedBloomFilter + metrics mutableSegmentsMetrics logger *zap.Logger } +type indexedBloomFilter struct { + writes *bloom.BloomFilter + snapshotDirty bool + snapshot *bytes.Buffer +} + +var ( + // Estimate bloom values for 1million and 1% false positive rate. + // Roughly 1mb size with k:7 (hash 7 times on insert/lookup). + bloomM, bloomK = bloom.EstimateFalsePositiveRate(1<<20, 0.01) +) + +func newIndexedBloomFilter() *indexedBloomFilter { + bf := bloom.NewBloomFilter(bloomM, bloomK) + snapshot := bytes.NewBuffer(nil) + _ = bf.BitSet().Write(snapshot) + return &indexedBloomFilter{ + writes: bf, + snapshot: snapshot, + } +} + +func (f *indexedBloomFilter) Write(id []byte) { + f.writes.Add(id) + f.snapshotDirty = true +} + +func (f *indexedBloomFilter) UpdateSnapshotIfRequired() { + if !f.snapshotDirty { + return + } + f.snapshot.Truncate(0) + _ = f.writes.BitSet().Write(f.snapshot) + f.snapshotDirty = false +} + +type indexedBloomFilterSnapshot struct { + buffer []byte + bloomFilter *bloom.ReadOnlyBloomFilter +} + +func newIndexedBloomFilterSnapshot() *indexedBloomFilterSnapshot { + return &indexedBloomFilterSnapshot{} +} + +func (s *indexedBloomFilterSnapshot) Reset() { + for i := range s.buffer { + s.buffer[i] = 0 + } +} + +func (s *indexedBloomFilterSnapshot) ReadOnlyBloomFilter() *bloom.ReadOnlyBloomFilter { + // In future would be good to update read only bloom filter instead + // of having to create a new one with the buffer (even though it's just + // a wrapper over the buffer.). 
+ return bloom.NewReadOnlyBloomFilter(bloomM, bloomK, s.buffer) +} + +func (f *indexedBloomFilter) MergeSnapshot( + snap *indexedBloomFilterSnapshot, +) { + data := f.snapshot.Bytes() + size := len(data) + if cap(snap.buffer) < size { + // Grow buffer if required. + snap.buffer = make([]byte, size) + } else { + snap.buffer = snap.buffer[:size] + } + + for i := range snap.buffer { + snap.buffer[i] |= data[i] + } +} + type mutableSegmentsMetrics struct { foregroundCompactionPlanRunLatency tally.Timer foregroundCompactionTaskRunLatency tally.Timer @@ -98,9 +182,10 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { } } -// NewBlock returns a new Block, representing a complete reverse index for the -// duration of time specified. It is backed by one or more segments. +// newMutableSegments returns a new Block, representing a complete reverse index +// for the duration of time specified. It is backed by one or more segments. func newMutableSegments( + md namespace.Metadata, blockStart time.Time, opts Options, blockOpts BlockOptions, @@ -108,17 +193,44 @@ func newMutableSegments( iopts instrument.Options, ) *mutableSegments { m := &mutableSegments{ - blockStart: blockStart, - opts: opts, - blockOpts: blockOpts, - iopts: iopts, - metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), - logger: iopts.Logger(), + blockStart: blockStart, + blockSize: md.Options().IndexOptions().BlockSize(), + opts: opts, + blockOpts: blockOpts, + iopts: iopts, + indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), + metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), + logger: iopts.Logger(), } m.optsListener = namespaceRuntimeOptsMgr.RegisterListener(m) return m } +func (m *mutableSegments) NotifySealedBlocks( + sealed []xtime.UnixNano, +) error { + if len(sealed) == 0 { + return nil + } + + m.indexedBloomFilterByTimeLock.Lock() + for _, blockStart := range sealed { + _, exists := m.indexedBloomFilterByTime[blockStart] + if !exists { + continue + } + // Remove indexed set if block now sealed. + delete(m.indexedBloomFilterByTime, blockStart) + } + m.indexedBloomFilterByTimeLock.Unlock() + + m.Lock() + m.maybeBackgroundCompactWithLock() + m.Unlock() + + return nil +} + func (m *mutableSegments) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptions) { m.Lock() // Update current runtime opts for segment builders created in future. @@ -144,6 +256,7 @@ func (m *mutableSegments) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptio func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, error) { m.Lock() if m.state == mutableSegmentsStateClosed { + m.Unlock() return MutableSegmentsStats{}, errMutableSegmentsAlreadyClosed } @@ -164,6 +277,37 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, builder := m.compact.segmentBuilder m.Unlock() + // Kick off updating indexedBloomFilterByTime if needed + var updateIndexedBloomFilters sync.WaitGroup + if m.blockOpts.InMemoryBlock { + updateIndexedBloomFilters.Add(1) + go func() { + m.indexedBloomFilterByTimeLock.Lock() + defer func() { + updateIndexedBloomFilters.Done() + m.indexedBloomFilterByTimeLock.Unlock() + }() + + // Update bloom filters. 
+ entries := inserts.PendingEntries() + docs := inserts.PendingDocs() + for i := range entries { + blockStart := entries[i].indexBlockStart(m.blockSize) + bloomFilter, ok := m.indexedBloomFilterByTime[blockStart] + if !ok { + bloomFilter = newIndexedBloomFilter() + m.indexedBloomFilterByTime[blockStart] = bloomFilter + } + bloomFilter.writes.Add(docs[i].ID) + } + + // Update snapshots. + for _, bloomFilter := range m.indexedBloomFilterByTime { + bloomFilter.UpdateSnapshotIfRequired() + } + }() + } + defer func() { m.Lock() m.compact.compactingForeground = false @@ -189,6 +333,11 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, return MutableSegmentsStats{}, err } + if m.blockOpts.InMemoryBlock { + // Wait for bloom filters to be updated. + updateIndexedBloomFilters.Wait() + } + // Return result from the original insertion since compaction was successful. return MutableSegmentsStats{ NumForeground: result.numForeground, @@ -343,6 +492,18 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }) } + m.indexedBloomFilterByTimeLock.RLock() + for _, bloomFilter := range m.indexedBloomFilterByTime { + if m.backgroundCompactIndexedSnapshot == nil { + m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() + } + m.backgroundCompactIndexedSnapshot.Reset() + bloomFilter.MergeSnapshot(m.backgroundCompactIndexedSnapshot) + } + m.indexedBloomFilterByTimeLock.RUnlock() + + indexedAndActiveBloomFilter := m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() + plan, err := compaction.NewPlan(segs, m.opts.BackgroundCompactionPlannerOptions()) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { @@ -351,6 +512,57 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { return } + if len(plan.Tasks) == 0 { + // Check if any segments needs filtering. + for _, seg := range m.backgroundSegments { + reader, err := seg.Segment().Reader() + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("index background compaction plan reader error", zap.Error(err)) + }) + return + } + + iter, err := reader.AllDocs() + if err != nil { + _ = reader.Close() + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("index background compaction plan iter start error", zap.Error(err)) + }) + return + } + + for iter.Next() { + d := iter.Current() + if !indexedAndActiveBloomFilter.Test(d.ID) { + // This metric is not active, likely part of a block + // time window that is now sealed. + // We need to purge it to remove memory. + plan.Tasks = append(plan.Tasks, compaction.Task{ + Segments: []compaction.Segment{ + { + Age: seg.Age(), + Size: seg.Segment().Size(), + Type: segments.FSTType, + Segment: seg.Segment(), + }, + }, + }) + break + } + } + if err := xerrors.FirstError(iter.Err(), iter.Close()); err != nil { + _ = reader.Close() + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("index background compaction plan iter done error", zap.Error(err)) + }) + return + } + + _ = reader.Close() + } + } + if len(plan.Tasks) == 0 { return } @@ -358,7 +570,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Kick off compaction. 
m.compact.compactingBackground = true go func() { - m.backgroundCompactWithPlan(plan) + m.backgroundCompactWithPlan(plan, indexedAndActiveBloomFilter) m.Lock() m.compact.compactingBackground = false @@ -412,7 +624,10 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg } } -func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { +func (m *mutableSegments) backgroundCompactWithPlan( + plan *compaction.Plan, + activeBloomFilter *bloom.ReadOnlyBloomFilter, +) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() defer sw.Stop() @@ -438,7 +653,7 @@ func (m *mutableSegments) backgroundCompactWithPlan(plan *compaction.Plan) { } for i, task := range plan.Tasks { - err := m.backgroundCompactWithTask(task, log, + err := m.backgroundCompactWithTask(task, activeBloomFilter, log, logger.With(zap.Int("task", i))) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { @@ -462,6 +677,7 @@ func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, + activeBloomFilter *bloom.ReadOnlyBloomFilter, log bool, logger *zap.Logger, ) error { @@ -475,12 +691,14 @@ func (m *mutableSegments) backgroundCompactWithTask( } start := time.Now() - compacted, err := m.compact.backgroundCompactor.Compact(segments, mmap.ReporterOptions{ - Context: mmap.Context{ - Name: mmapIndexBlockName, - }, - Reporter: m.opts.MmapReporter(), - }) + compacted, err := m.compact.backgroundCompactor.Compact(segments, + activeBloomFilter, + mmap.ReporterOptions{ + Context: mmap.Context{ + Name: mmapIndexBlockName, + }, + Reporter: m.opts.MmapReporter(), + }) took := time.Since(start) m.metrics.backgroundCompactionTaskRunLatency.Record(took) @@ -542,7 +760,9 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( return append(result, newReadableSeg(compacted, m.opts)) } -func (m *mutableSegments) foregroundCompactWithBuilder(builder segment.DocumentsBuilder) (compactResult, error) { +func (m *mutableSegments) foregroundCompactWithBuilder( + builder segment.DocumentsBuilder, +) (compactResult, error) { // We inserted some documents, need to compact immediately into a // foreground segment. m.Lock() diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index b8962d9068..fb73b85add 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -350,6 +350,10 @@ type OnIndexSeries interface { // Block represents a collection of segments. Each `Block` is a complete reverse // index for a period of time defined by [StartTime, EndTime). type Block interface { + // InMemoryBlockNotifySealedBlocks notifies an in memory block of + // sealed blocks. + InMemoryBlockNotifySealedBlocks(sealed []xtime.UnixNano) error + // StartTime returns the start time of the period this Block indexes. StartTime() time.Time @@ -389,6 +393,9 @@ type Block interface { // Stats returns block stats. Stats(reporter BlockStatsReporter) error + // IsOpen returns true if open and not sealed yet. + IsOpen() bool + // Seal prevents the block from taking any more writes, but, it still permits // addition of segments via Bootstrap(). 
Seal() error

diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go
index e9779b79dd..21d62b4085 100644
--- a/src/m3ninx/index/segment/builder/multi_segments_builder.go
+++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go
@@ -24,6 +24,7 @@ import (
 	"io"
 	"sort"
 
+	"github.com/m3db/bloom/v4"
 	"github.com/m3db/m3/src/m3ninx/doc"
 	"github.com/m3db/m3/src/m3ninx/index"
 	"github.com/m3db/m3/src/m3ninx/index/segment"
@@ -34,6 +35,7 @@ import (
 type builderFromSegments struct {
 	docs           []doc.Document
 	idSet          *IDsMap
+	filter         *bloom.ReadOnlyBloomFilter
 	segments       []segmentMetadata
 	termsIter      *termsIterFromSegments
 	segmentsOffset postings.ID
@@ -42,11 +44,12 @@ type builderFromSegments struct {
 type segmentMetadata struct {
 	segment segment.Segment
 	offset  postings.ID
-	// duplicatesAsc is a lookup of document IDs are duplicates
-	// in this segment, that is documents that are already
-	// contained by other segments and hence should not be
+	// skipAsc is a sorted lookup of document IDs that are duplicates or
+	// are to be filtered out of this segment, that is documents that are
+	// already contained by other segments or should not be included
+	// in the output segment and hence should not be
 	// returned when looking up documents.
-	duplicatesAsc []postings.ID
+	skipAsc []postings.ID
 }
 
 // NewBuilderFromSegments returns a new builder from segments.
@@ -81,6 +84,10 @@ func (b *builderFromSegments) Reset() {
 	b.termsIter.clear()
 }
 
+func (b *builderFromSegments) SetFilter(filter *bloom.ReadOnlyBloomFilter) {
+	b.filter = filter
+}
+
 func (b *builderFromSegments) AddSegments(segments []segment.Segment) error {
 	// numMaxDocs can sometimes be larger than the actual number of documents
 	// since some are duplicates
@@ -103,13 +110,19 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error {
 	}
 
 	var (
-		added      int
-		duplicates []postings.ID
+		added int
+		skip  []postings.ID
 	)
 	for iter.Next() {
 		d := iter.Current()
 		if b.idSet.Contains(d.ID) {
-			duplicates = append(duplicates, iter.PostingsID())
+			// Skip duplicates.
+			skip = append(skip, iter.PostingsID())
 			continue
 		}
+		if b.filter != nil && !b.filter.Test(d.ID) {
+			// Actively filtering and ID is not contained.
+ skip = append(skip, iter.PostingsID()) continue } b.idSet.SetUnsafe(d.ID, struct{}{}, IDsMapSetUnsafeOptions{ @@ -126,14 +139,14 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { } // Sort duplicates in ascending order - sort.Slice(duplicates, func(i, j int) bool { - return duplicates[i] < duplicates[j] + sort.Slice(skip, func(i, j int) bool { + return skip[i] < skip[j] }) b.segments = append(b.segments, segmentMetadata{ - segment: segment, - offset: b.segmentsOffset, - duplicatesAsc: duplicates, + segment: segment, + offset: b.segmentsOffset, + skipAsc: skip, }) b.segmentsOffset += postings.ID(added) } diff --git a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go index 3d3330a554..d04dc27e78 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go @@ -147,9 +147,10 @@ func (i *multiKeyPostingsListIterator) Next() bool { return false } - if fieldsKeyIter.segment.offset == 0 { + if fieldsKeyIter.segment.offset == 0 && len(fieldsKeyIter.segment.skipAsc) == 0 { // No offset, which means is first segment we are combining from // so can just direct union. + // Make sure skipAsc is empty otherwise we need to do filtering. if index.MigrationReadOnlyPostings() { if err := i.currFieldPostingsList.AddIterator(pl.Iterator()); err != nil { i.err = err @@ -167,17 +168,17 @@ func (i *multiKeyPostingsListIterator) Next() bool { // We have to taken into account the offset and duplicates var ( iter = pl.Iterator() - duplicates = fieldsKeyIter.segment.duplicatesAsc + skip = fieldsKeyIter.segment.skipAsc negativeOffset postings.ID ) for iter.Next() { curr := iter.Current() - for len(duplicates) > 0 && curr > duplicates[0] { - duplicates = duplicates[1:] + for len(skip) > 0 && curr > skip[0] { + skip = skip[1:] negativeOffset++ } - if len(duplicates) > 0 && curr == duplicates[0] { - duplicates = duplicates[1:] + if len(skip) > 0 && curr == skip[0] { + skip = skip[1:] negativeOffset++ // Also skip this value, as itself is a duplicate continue @@ -197,6 +198,13 @@ func (i *multiKeyPostingsListIterator) Next() bool { return false } } + + if i.currFieldPostingsList.IsEmpty() { + // Everything skipped or term is empty. + // TODO: make this non-stack based (i.e. not recursive). + return i.Next() + } + return true } diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go index 225bc3b732..5c40542ab7 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go @@ -136,7 +136,7 @@ func (i *termsIterFromSegments) Next() bool { termsKeyIter := iter.(*termsKeyIter) _, list := termsKeyIter.iter.Current() - if termsKeyIter.segment.offset == 0 { + if termsKeyIter.segment.offset == 0 && len(termsKeyIter.segment.skipAsc) == 0 { // No offset, which means is first segment we are combining from // so can just direct union. 
if index.MigrationReadOnlyPostings() { @@ -156,17 +156,17 @@ func (i *termsIterFromSegments) Next() bool { // We have to taken into account the offset and duplicates var ( iter = list.Iterator() - duplicates = termsKeyIter.segment.duplicatesAsc + skip = termsKeyIter.segment.skipAsc negativeOffset postings.ID ) for iter.Next() { curr := iter.Current() - for len(duplicates) > 0 && curr > duplicates[0] { - duplicates = duplicates[1:] + for len(skip) > 0 && curr > skip[0] { + skip = skip[1:] negativeOffset++ } - if len(duplicates) > 0 && curr == duplicates[0] { - duplicates = duplicates[1:] + if len(skip) > 0 && curr == skip[0] { + skip = skip[1:] negativeOffset++ // Also skip this value, as itself is a duplicate continue @@ -187,6 +187,12 @@ func (i *termsIterFromSegments) Next() bool { } } + if i.currPostingsList.IsEmpty() { + // Everything skipped or term is empty. + // TODO: make this non-stack based (i.e. not recursive). + return i.Next() + } + return true } diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go index 94ee167716..eb71747a7e 100644 --- a/src/m3ninx/index/segment/types.go +++ b/src/m3ninx/index/segment/types.go @@ -22,6 +22,7 @@ package segment import ( "errors" + "github.com/m3db/bloom/v4" "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" @@ -220,6 +221,10 @@ type CloseableDocumentsBuilder interface { type SegmentsBuilder interface { Builder + // SetFilter sets a filter on which documents to retain + // when building the segment. + SetFilter(bloom *bloom.ReadOnlyBloomFilter) + // AddSegments adds segments to build from. AddSegments(segments []Segment) error } From 78eaa8b83a5bb643aab9d6fbb77cb38bdeb9143e Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 15 Jan 2021 20:19:49 -0500 Subject: [PATCH 054/106] Create the in memory block earlier --- src/dbnode/storage/index.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 26601e85d9..52ef98d95c 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -365,6 +365,15 @@ func newNamespaceIndexWithOptions( shardSet: shardSet, } + futureBlock := nowFn().Add(10 * 365 * 24 * time.Hour) + inMemBlock, err := idx.newBlockFn(futureBlock, idx.nsMetadata, + index.BlockOptions{InMemoryBlock: true}, idx.namespaceRuntimeOptsMgr, idx.opts.IndexOptions()) + if err != nil { + return nil, err + } + + idx.inMemoryBlock = inMemBlock + // Assign shard set upfront. 
idx.AssignShardSet(shardSet) @@ -410,15 +419,6 @@ func newNamespaceIndexWithOptions( return nil, err } - futureBlock := nowFn().Add(10 * 365 * 24 * time.Hour) - inMemBlock, err := idx.newBlockFn(futureBlock, idx.nsMetadata, - index.BlockOptions{InMemoryBlock: true}, idx.namespaceRuntimeOptsMgr, idx.opts.IndexOptions()) - if err != nil { - return nil, err - } - - idx.inMemoryBlock = inMemBlock - // Report stats go idx.reportStatsUntilClosed() From 5f93d0c6f158caf352c1a7e0300587ae792c87a1 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 19 Jan 2021 21:36:20 -0500 Subject: [PATCH 055/106] Avoid indexing already indexed timeseries --- .../storage/index/compaction/compactor.go | 16 +- src/dbnode/storage/index/mutable_segments.go | 158 +++++++++++------- src/dbnode/storage/index/types.go | 19 ++- 3 files changed, 123 insertions(+), 70 deletions(-) diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index e70e1cede8..7b27b0bc66 100644 --- a/src/dbnode/storage/index/compaction/compactor.go +++ b/src/dbnode/storage/index/compaction/compactor.go @@ -23,7 +23,6 @@ package compaction import ( "bytes" "errors" - "github.com/m3db/bloom/v4" "io" "sync" @@ -35,10 +34,15 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding/docs" xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/mmap" + + "github.com/m3db/bloom/v4" ) var ( - errCompactorBuilderEmpty = errors.New("builder has no documents") + // ErrCompactorBuilderEmpty is returned when the compaction + // would result in an empty segment. + ErrCompactorBuilderEmpty = errors.New("builder has no documents") + errCompactorBuilderNil = errors.New("builder is nil") errCompactorClosed = errors.New("compactor is closed") ) @@ -141,7 +145,7 @@ func (c *Compactor) CompactUsingBuilder( } if builder == nil { - return nil, errCompactorBuilderEmpty + return nil, errCompactorBuilderNil } if len(segs) == 0 { @@ -151,9 +155,7 @@ func (c *Compactor) CompactUsingBuilder( // Need to combine segments first batch := c.docsPool.Get() - defer func() { - c.docsPool.Put(batch) - }() + defer c.docsPool.Put(batch) // flushBatch is declared to reuse the same code from the // inner loop and the completion of the loop @@ -246,7 +248,7 @@ func (c *Compactor) compactFromBuilderWithLock( // runs, we need to copy the docs slice allDocs := builder.Docs() if len(allDocs) == 0 { - return nil, errCompactorBuilderEmpty + return nil, ErrCompactorBuilderEmpty } err := c.writer.Reset(builder) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index bf38515a86..91303e8eed 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -90,6 +90,7 @@ type mutableSegments struct { } type indexedBloomFilter struct { + doNotWrite *builder.IDsMap writes *bloom.BloomFilter snapshotDirty bool snapshot *bytes.Buffer @@ -106,12 +107,23 @@ func newIndexedBloomFilter() *indexedBloomFilter { snapshot := bytes.NewBuffer(nil) _ = bf.BitSet().Write(snapshot) return &indexedBloomFilter{ + doNotWrite: builder.NewIDsMap(builder.IDsMapOptions{ + InitialSize: 4096, + }), writes: bf, snapshot: snapshot, } } +func (f *indexedBloomFilter) ContainsWithNoFalsePositive(id []byte) bool { + return f.doNotWrite.Contains(id) +} + func (f *indexedBloomFilter) Write(id []byte) { + f.doNotWrite.SetUnsafe(id, struct{}{}, builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) f.writes.Add(id) 
f.snapshotDirty = true
 }
 
@@ -125,6 +137,23 @@ func (f *indexedBloomFilter) UpdateSnapshotIfRequired() {
 	f.snapshotDirty = false
 }
 
+func (f *indexedBloomFilter) MergeSnapshot(
+	snap *indexedBloomFilterSnapshot,
+) {
+	data := f.snapshot.Bytes()
+	size := len(data)
+	if cap(snap.buffer) < size {
+		// Grow buffer if required.
+		snap.buffer = make([]byte, size)
+	} else {
+		snap.buffer = snap.buffer[:size]
+	}
+
+	for i := range snap.buffer {
+		snap.buffer[i] |= data[i]
+	}
+}
+
 type indexedBloomFilterSnapshot struct {
 	buffer      []byte
 	bloomFilter *bloom.ReadOnlyBloomFilter
@@ -147,23 +176,6 @@ func (s *indexedBloomFilterSnapshot) ReadOnlyBloomFilter() *bloom.ReadOnlyBloomF
 	return bloom.NewReadOnlyBloomFilter(bloomM, bloomK, s.buffer)
 }
 
-func (f *indexedBloomFilter) MergeSnapshot(
-	snap *indexedBloomFilterSnapshot,
-) {
-	data := f.snapshot.Bytes()
-	size := len(data)
-	if cap(snap.buffer) < size {
-		// Grow buffer if required.
-		snap.buffer = make([]byte, size)
-	} else {
-		snap.buffer = snap.buffer[:size]
-	}
-
-	for i := range snap.buffer {
-		snap.buffer[i] |= data[i]
-	}
-}
-
 type mutableSegmentsMetrics struct {
 	foregroundCompactionPlanRunLatency tally.Timer
 	foregroundCompactionTaskRunLatency tally.Timer
@@ -277,35 +289,56 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats,
 	builder := m.compact.segmentBuilder
 	m.Unlock()
 
-	// Kick off updating indexedBloomFilterByTime if needed
-	var updateIndexedBloomFilters sync.WaitGroup
+	// Update indexedBloomFilterByTime if needed.
 	if m.blockOpts.InMemoryBlock {
-		updateIndexedBloomFilters.Add(1)
-		go func() {
-			m.indexedBloomFilterByTimeLock.Lock()
-			defer func() {
-				updateIndexedBloomFilters.Done()
-				m.indexedBloomFilterByTimeLock.Unlock()
-			}()
-
-			// Update bloom filters.
-			entries := inserts.PendingEntries()
-			docs := inserts.PendingDocs()
-			for i := range entries {
-				blockStart := entries[i].indexBlockStart(m.blockSize)
-				bloomFilter, ok := m.indexedBloomFilterByTime[blockStart]
-				if !ok {
-					bloomFilter = newIndexedBloomFilter()
-					m.indexedBloomFilterByTime[blockStart] = bloomFilter
+		// Take references to the pending entries and docs
+		// and make sure not to touch sort order until later.
+		entries := inserts.PendingEntries()
+		docs := inserts.PendingDocs()
+
+		m.indexedBloomFilterByTimeLock.Lock()
+		// Remove from indexing anything already indexed and
+		// also update the tracking of what things have been indexed
+		// for what block starts.
+		for i := range entries {
+			blockStart := entries[i].indexBlockStart(m.blockSize)
+			needsIndex := true
+			needsBloomFilterWrite := true
+			for bloomFilterBlockStart, bloomFilter := range m.indexedBloomFilterByTime {
+				if bloomFilter.ContainsWithNoFalsePositive(docs[i].ID) {
+					// Already indexed, do not need to index.
+					needsIndex = false
+					if blockStart == bloomFilterBlockStart {
+						// Do not need to update the fact that this
+						// ID is contained by this block start.
+						needsBloomFilterWrite = false
+						break
+					}
 				}
-				bloomFilter.writes.Add(docs[i].ID)
 			}
 
-			// Update snapshots.
-			for _, bloomFilter := range m.indexedBloomFilterByTime {
-				bloomFilter.UpdateSnapshotIfRequired()
+			if !needsIndex {
+				// Mark the fact that it doesn't need indexing.
+				inserts.MarkEntrySuccess(i)
 			}
-		}()
+
+			if !needsBloomFilterWrite {
+				// No need to update the bloom filter.
+ continue + } + + bloomFilter, ok := m.indexedBloomFilterByTime[blockStart] + if !ok { + bloomFilter = newIndexedBloomFilter() + m.indexedBloomFilterByTime[blockStart] = bloomFilter + } + bloomFilter.Write(docs[i].ID) + } + // Update bloom filter snapshots if required. + for _, bloomFilter := range m.indexedBloomFilterByTime { + bloomFilter.UpdateSnapshotIfRequired() + } + m.indexedBloomFilterByTimeLock.Unlock() } defer func() { @@ -333,11 +366,6 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, return MutableSegmentsStats{}, err } - if m.blockOpts.InMemoryBlock { - // Wait for bloom filters to be updated. - updateIndexedBloomFilters.Wait() - } - // Return result from the original insertion since compaction was successful. return MutableSegmentsStats{ NumForeground: result.numForeground, @@ -492,12 +520,15 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }) } + // Prepare the bloom filter to merge into from the live time windows. + if m.backgroundCompactIndexedSnapshot == nil { + m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() + } + m.backgroundCompactIndexedSnapshot.Reset() + + // Merge with existing live time windows. m.indexedBloomFilterByTimeLock.RLock() for _, bloomFilter := range m.indexedBloomFilterByTime { - if m.backgroundCompactIndexedSnapshot == nil { - m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() - } - m.backgroundCompactIndexedSnapshot.Reset() bloomFilter.MergeSnapshot(m.backgroundCompactIndexedSnapshot) } m.indexedBloomFilterByTimeLock.RUnlock() @@ -706,21 +737,31 @@ func (m *mutableSegments) backgroundCompactWithTask( logger.Debug("done compaction task", zap.Duration("took", took)) } + // Check if result would have resulted in an empty segment. + empty := err == compaction.ErrCompactorBuilderEmpty + if empty { + // Don't return the error since we need to remove the old segments + // by calling addCompactedSegmentFromSegmentsWithLock. + err = nil + } if err != nil { return err } - // Add a read through cache for repeated expensive queries against - // background compacted segments since they can live for quite some - // time and accrue a large set of documents. - segment := m.newReadThroughSegment(compacted) - // Rotate out the replaced frozen segments and add the compacted one. m.Lock() defer m.Unlock() + var replaceSegment segment.Segment + if !empty { + // Add a read through cache for repeated expensive queries against + // background compacted segments since they can live for quite some + // time and accrue a large set of documents. + replaceSegment = m.newReadThroughSegment(compacted) + } + result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, segment) + segments, replaceSegment) m.backgroundSegments = result return nil @@ -756,6 +797,11 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( } } + if compacted == nil { + // Compacted segment was empty. + return result + } + // Return all the ones we kept plus the new compacted segment return append(result, newReadableSeg(compacted, m.opts)) } diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index fb73b85add..fa0e2c25b4 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -759,13 +759,18 @@ func (b *WriteBatch) SortByEnqueued() { // MarkUnmarkedEntriesSuccess marks all unmarked entries as success. 
func (b *WriteBatch) MarkUnmarkedEntriesSuccess() { for idx := range b.entries { - if !b.entries[idx].result.Done { - blockStart := b.entries[idx].indexBlockStart(b.opts.IndexBlockSize) - b.entries[idx].OnIndexSeries.OnIndexSuccess(blockStart) - b.entries[idx].OnIndexSeries.OnIndexFinalize(blockStart) - b.entries[idx].result.Done = true - b.entries[idx].result.Err = nil - } + b.MarkEntrySuccess(idx) + } +} + +// MarkEntrySuccess marks an entry as success. +func (b *WriteBatch) MarkEntrySuccess(idx int) { + if !b.entries[idx].result.Done { + blockStart := b.entries[idx].indexBlockStart(b.opts.IndexBlockSize) + b.entries[idx].OnIndexSeries.OnIndexSuccess(blockStart) + b.entries[idx].OnIndexSeries.OnIndexFinalize(blockStart) + b.entries[idx].result.Done = true + b.entries[idx].result.Err = nil } } From 7ce03c0d4ddc21739de71e0d54d7e206851ffe77 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 20 Jan 2021 16:55:06 -0500 Subject: [PATCH 056/106] Add instrumentation --- src/dbnode/storage/index.go | 14 +- .../storage/index/compaction/compactor.go | 4 +- src/dbnode/storage/index/mutable_segments.go | 138 ++++++++++++------ src/dbnode/storage/index/types.go | 14 +- .../segment/builder/multi_segments_builder.go | 11 +- src/m3ninx/index/segment/types.go | 6 +- 6 files changed, 136 insertions(+), 51 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 52ef98d95c..a9765fd89f 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -836,8 +836,10 @@ func (i *nsIndex) writeBatchForBlockStart( // Record mutable segments count foreground/background if latest block. if stats := result.MutableSegmentsStats; !stats.Empty() && latest { - i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.NumForeground)) - i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.NumBackground)) + i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.Foreground.NumSegments)) + i.metrics.latestBlockNumDocsForeground.Update(float64(stats.Foreground.NumDocs)) + i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.Background.NumSegments)) + i.metrics.latestBlockNumDocsBackground.Update(float64(stats.Background.NumDocs)) } // Allow for duplicate write errors since due to re-indexing races @@ -2275,7 +2277,9 @@ type nsIndexMetrics struct { flushDocsNew tally.Counter flushDocsCached tally.Counter latestBlockNumSegmentsForeground tally.Gauge + latestBlockNumDocsForeground tally.Gauge latestBlockNumSegmentsBackground tally.Gauge + latestBlockNumDocsBackground tally.Gauge loadedDocsPerQuery tally.Histogram queryExhaustiveSuccess tally.Counter @@ -2351,9 +2355,15 @@ func newNamespaceIndexMetrics( latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{ "segment_type": "foreground", }).Gauge("latest-block-num-segments"), + latestBlockNumDocsForeground: scope.Tagged(map[string]string{ + "segment_type": "foreground", + }).Gauge("latest-block-num-docs"), latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{ "segment_type": "background", }).Gauge("latest-block-num-segments"), + latestBlockNumDocsBackground: scope.Tagged(map[string]string{ + "segment_type": "background", + }).Gauge("latest-block-num-docs"), loadedDocsPerQuery: scope.Histogram( "loaded-docs-per-query", tally.MustMakeExponentialValueBuckets(10, 2, 16), diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index 7b27b0bc66..0f806ad8a8 100644 --- a/src/dbnode/storage/index/compaction/compactor.go 
+++ b/src/dbnode/storage/index/compaction/compactor.go @@ -36,6 +36,7 @@ import ( "github.com/m3db/m3/src/x/mmap" "github.com/m3db/bloom/v4" + "github.com/uber-go/tally" ) var ( @@ -111,6 +112,7 @@ func NewCompactor( func (c *Compactor) Compact( segs []segment.Segment, filter *bloom.ReadOnlyBloomFilter, + filterCounter tally.Counter, reporterOptions mmap.ReporterOptions, ) (fst.Segment, error) { c.Lock() @@ -121,7 +123,7 @@ func (c *Compactor) Compact( } c.builder.Reset() - c.builder.SetFilter(filter) + c.builder.SetFilter(filter, filterCounter) if err := c.builder.AddSegments(segs); err != nil { return nil, err } diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 91303e8eed..f7c40cd3df 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -181,16 +181,41 @@ type mutableSegmentsMetrics struct { foregroundCompactionTaskRunLatency tally.Timer backgroundCompactionPlanRunLatency tally.Timer backgroundCompactionTaskRunLatency tally.Timer + activeBlockIndexNew tally.Counter + activeBlockIndexExists tally.Counter + activeBlockBloomNew tally.Counter + activeBlockBloomExists tally.Counter + activeBlockBloomUpdate tally.Counter + activeBlockGarbageCollectSegment tally.Counter + activeBlockGarbageCollectSeries tally.Counter } func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { foregroundScope := s.Tagged(map[string]string{"compaction-type": "foreground"}) backgroundScope := s.Tagged(map[string]string{"compaction-type": "background"}) + activeBlockScope := s.SubScope("active-block") return mutableSegmentsMetrics{ foregroundCompactionPlanRunLatency: foregroundScope.Timer("compaction-plan-run-latency"), foregroundCompactionTaskRunLatency: foregroundScope.Timer("compaction-task-run-latency"), backgroundCompactionPlanRunLatency: backgroundScope.Timer("compaction-plan-run-latency"), backgroundCompactionTaskRunLatency: backgroundScope.Timer("compaction-task-run-latency"), + activeBlockIndexNew: activeBlockScope.Tagged(map[string]string{ + "result_type": "new", + }).Counter("index-result"), + activeBlockIndexExists: activeBlockScope.Tagged(map[string]string{ + "result_type": "exists", + }).Counter("index-result"), + activeBlockBloomNew: activeBlockScope.Tagged(map[string]string{ + "result_type": "new", + }).Counter("bloom-result"), + activeBlockBloomExists: activeBlockScope.Tagged(map[string]string{ + "result_type": "exists", + }).Counter("bloom-result"), + activeBlockBloomUpdate: activeBlockScope.Tagged(map[string]string{ + "result_type": "update", + }).Counter("bloom-result"), + activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), + activeBlockGarbageCollectSeries: activeBlockScope.Counter("gc-series"), } } @@ -320,13 +345,23 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, if !needsIndex { // Mark the fact that it doesn't need indexing. inserts.MarkEntrySuccess(i) + m.metrics.activeBlockIndexExists.Inc(1) + } else { + m.metrics.activeBlockIndexNew.Inc(1) } if !needsBloomFilterWrite { // No need to update the bloom filter. 
+ m.metrics.activeBlockBloomExists.Inc(1) continue } + if !needsIndex { + m.metrics.activeBlockBloomUpdate.Inc(1) + } else { + m.metrics.activeBlockBloomNew.Inc(1) + } + bloomFilter, ok := m.indexedBloomFilterByTime[blockStart] if !ok { bloomFilter = newIndexedBloomFilter() @@ -367,10 +402,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, } // Return result from the original insertion since compaction was successful. - return MutableSegmentsStats{ - NumForeground: result.numForeground, - NumBackground: result.numBackground, - }, insertResultErr + return result, insertResultErr } func (m *mutableSegments) AddReaders(readers []segment.Reader) ([]segment.Reader, error) { @@ -454,14 +486,18 @@ func (m *mutableSegments) NumSegmentsAndDocs() (int64, int64) { m.RLock() defer m.RUnlock() + foregroundNumSegments, foregroundNumDocs := numSegmentsAndDocs(m.foregroundSegments) + backgroundNumSegments, backgroundNumDocs := numSegmentsAndDocs(m.backgroundSegments) + numSegments := foregroundNumSegments + backgroundNumSegments + numDocs := foregroundNumDocs + backgroundNumDocs + return numSegments, numDocs +} + +func numSegmentsAndDocs(segs []*readableSeg) (int64, int64) { var ( numSegments, numDocs int64 ) - for _, seg := range m.foregroundSegments { - numSegments++ - numDocs += seg.Segment().Size() - } - for _, seg := range m.backgroundSegments { + for _, seg := range segs { numSegments++ numDocs += seg.Segment().Size() } @@ -520,21 +556,6 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }) } - // Prepare the bloom filter to merge into from the live time windows. - if m.backgroundCompactIndexedSnapshot == nil { - m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() - } - m.backgroundCompactIndexedSnapshot.Reset() - - // Merge with existing live time windows. - m.indexedBloomFilterByTimeLock.RLock() - for _, bloomFilter := range m.indexedBloomFilterByTime { - bloomFilter.MergeSnapshot(m.backgroundCompactIndexedSnapshot) - } - m.indexedBloomFilterByTimeLock.RUnlock() - - indexedAndActiveBloomFilter := m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() - plan, err := compaction.NewPlan(segs, m.opts.BackgroundCompactionPlannerOptions()) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { @@ -543,9 +564,40 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { return } - if len(plan.Tasks) == 0 { + var indexedAndActiveBloomFilter *bloom.ReadOnlyBloomFilter + if m.blockOpts.InMemoryBlock { // Check if any segments needs filtering. + + // Prepare the bloom filter to merge into from the live time windows. + if m.backgroundCompactIndexedSnapshot == nil { + m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() + } + m.backgroundCompactIndexedSnapshot.Reset() + + // Merge with existing live time windows. + m.indexedBloomFilterByTimeLock.RLock() + for _, bloomFilter := range m.indexedBloomFilterByTime { + bloomFilter.MergeSnapshot(m.backgroundCompactIndexedSnapshot) + } + m.indexedBloomFilterByTimeLock.RUnlock() + + // Now check which segments need filtering. + indexedAndActiveBloomFilter = m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() for _, seg := range m.backgroundSegments { + alreadyHasTask := false + for _, task := range plan.Tasks { + for _, taskSegment := range task.Segments() { + if taskSegment == seg.Segment() { + alreadyHasTask = true + break + } + } + } + if alreadyHasTask { + // Skip needing to check if segment needs filtering. 
+ continue + } + reader, err := seg.Segment().Reader() if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { @@ -566,7 +618,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { for iter.Next() { d := iter.Current() if !indexedAndActiveBloomFilter.Test(d.ID) { - // This metric is not active, likely part of a block + // This series is not active, likely part of a block // time window that is now sealed. // We need to purge it to remove memory. plan.Tasks = append(plan.Tasks, compaction.Task{ @@ -579,6 +631,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }, }, }) + m.metrics.activeBlockGarbageCollectSegment.Inc(1) break } } @@ -724,6 +777,7 @@ func (m *mutableSegments) backgroundCompactWithTask( start := time.Now() compacted, err := m.compact.backgroundCompactor.Compact(segments, activeBloomFilter, + m.metrics.activeBlockGarbageCollectSeries, mmap.ReporterOptions{ Context: mmap.Context{ Name: mmapIndexBlockName, @@ -808,7 +862,7 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( func (m *mutableSegments) foregroundCompactWithBuilder( builder segment.DocumentsBuilder, -) (compactResult, error) { +) (MutableSegmentsStats, error) { // We inserted some documents, need to compact immediately into a // foreground segment. m.Lock() @@ -833,18 +887,18 @@ func (m *mutableSegments) foregroundCompactWithBuilder( plan, err := compaction.NewPlan(segs, m.opts.ForegroundCompactionPlannerOptions()) if err != nil { - return compactResult{}, err + return MutableSegmentsStats{}, err } // Check plan if len(plan.Tasks) == 0 { // Should always generate a task when a mutable builder is passed to planner - return compactResult{}, errForegroundCompactorNoPlan + return MutableSegmentsStats{}, errForegroundCompactorNoPlan } if taskNumBuilders(plan.Tasks[0]) != 1 { // First task of plan must include the builder, so we can avoid resetting it // for the first task, but then safely reset it in consequent tasks - return compactResult{}, errForegroundCompactorBadPlanFirstTask + return MutableSegmentsStats{}, errForegroundCompactorBadPlanFirstTask } // Move any unused segments to the background. 
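
One property worth making explicit for the garbage collection above: a bloom filter yields false positives but never false negatives, so !Test(id) proves the series was never written into any live window's filter and is safe to drop, while a false positive merely keeps a dead series around until a later compaction. By the standard formulas m = -n*ln(p)/(ln 2)^2 and k = (m/n)*ln 2, sizing for one million entries at a 1% false positive rate works out to roughly 10 million bits (about 1.2MB) and k = 7, which matches the "roughly 1mb size with k:7" comment earlier in the patch. A minimal sketch of the filter lifecycle, restricted to the m3db/bloom v4 calls already used in this patch:

package main

import (
	"bytes"
	"fmt"

	"github.com/m3db/bloom/v4"
)

func main() {
	// Same sizing as the patch: ~1M series at a 1% false positive rate.
	m, k := bloom.EstimateFalsePositiveRate(1<<20, 0.01)

	bf := bloom.NewBloomFilter(m, k)
	bf.Add([]byte("live-series"))

	// Snapshot the bit set and wrap it read-only, as the write path
	// snapshots per-window filters for the background compactor.
	var buf bytes.Buffer
	_ = bf.BitSet().Write(&buf)
	ro := bloom.NewReadOnlyBloomFilter(m, k, buf.Bytes())

	fmt.Println(ro.Test([]byte("live-series"))) // true: always retained
	fmt.Println(ro.Test([]byte("dead-series"))) // almost surely false: droppable
}
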
@@ -946,17 +1000,12 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( m.maybeBackgroundCompactWithLock() } -type compactResult struct { - numForeground int - numBackground int -} - func (m *mutableSegments) foregroundCompactWithTask( builder segment.DocumentsBuilder, task compaction.Task, log bool, logger *zap.Logger, -) (compactResult, error) { +) (MutableSegmentsStats, error) { if log { logger.Debug("start compaction task") } @@ -984,7 +1033,7 @@ func (m *mutableSegments) foregroundCompactWithTask( } if err != nil { - return compactResult{}, err + return MutableSegmentsStats{}, err } // Add a read through cache for repeated expensive queries against @@ -999,10 +1048,17 @@ func (m *mutableSegments) foregroundCompactWithTask( result := m.addCompactedSegmentFromSegmentsWithLock(m.foregroundSegments, segments, segment) m.foregroundSegments = result - - return compactResult{ - numForeground: len(m.foregroundSegments), - numBackground: len(m.backgroundSegments), + foregroundNumSegments, foregroundNumDocs := numSegmentsAndDocs(m.foregroundSegments) + backgroundNumSegments, backgroundNumDocs := numSegmentsAndDocs(m.backgroundSegments) + return MutableSegmentsStats{ + Foreground: MutableSegmentsSegmentStats{ + NumSegments: foregroundNumSegments, + NumDocs: foregroundNumDocs, + }, + Background: MutableSegmentsSegmentStats{ + NumSegments: backgroundNumSegments, + NumDocs: backgroundNumDocs, + }, }, nil } diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index fa0e2c25b4..da5919bb52 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -513,13 +513,21 @@ type WriteBatchResult struct { // MutableSegmentsStats contains metadata about // an insertion into mutable segments. type MutableSegmentsStats struct { - NumForeground int - NumBackground int + Foreground MutableSegmentsSegmentStats + Background MutableSegmentsSegmentStats +} + +// MutableSegmentsSegmentStats contains metadata about +// a set of mutable segments segment type. +type MutableSegmentsSegmentStats struct { + NumSegments int64 + NumDocs int64 } // Empty returns whether stats is empty or not. func (s MutableSegmentsStats) Empty() bool { - return s.NumForeground == 0 && s.NumBackground == 0 + return s.Foreground == MutableSegmentsSegmentStats{} && + s.Background == MutableSegmentsSegmentStats{} } // BlockTickResult returns statistics about tick. 
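
For the builder diff that follows, the subtle piece is how skipAsc interacts with postings IDs: when segments are combined, a surviving document's new postings ID is its original ID, minus the number of skipped (duplicate or filtered-out) IDs at or before it, plus the destination segment's base offset. This is exactly the skip/negativeOffset walk in the multi-segments iterators earlier in this series. A small self-contained sketch of that arithmetic, using plain uint64 in place of postings.ID:

package main

import "fmt"

// remap applies the same arithmetic as the multi-segments iterators:
// skipAsc must be sorted ascending and ids iterated in ascending order.
func remap(ids, skipAsc []uint64, base uint64) []uint64 {
	var (
		out            []uint64
		negativeOffset uint64
	)
	skip := skipAsc
	for _, curr := range ids {
		for len(skip) > 0 && curr > skip[0] {
			skip = skip[1:]
			negativeOffset++
		}
		if len(skip) > 0 && curr == skip[0] {
			skip = skip[1:]
			negativeOffset++ // the skipped ID itself also shifts later IDs
			continue
		}
		out = append(out, base+curr-negativeOffset)
	}
	return out
}

func main() {
	// IDs 1 and 3 are skipped: 0, 2 and 4 survive and are repacked
	// densely at destination offset 100 -> [100 101 102].
	fmt.Println(remap([]uint64{0, 1, 2, 3, 4}, []uint64{1, 3}, 100))
}
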
diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go index 21d62b4085..525b6814e1 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_builder.go +++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go @@ -24,18 +24,21 @@ import ( "io" "sort" - "github.com/m3db/bloom/v4" "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/postings" xerrors "github.com/m3db/m3/src/x/errors" + + "github.com/m3db/bloom/v4" + "github.com/uber-go/tally" ) type builderFromSegments struct { docs []doc.Document idSet *IDsMap filter *bloom.ReadOnlyBloomFilter + filterCount tally.Counter segments []segmentMetadata termsIter *termsIterFromSegments segmentsOffset postings.ID @@ -84,8 +87,9 @@ func (b *builderFromSegments) Reset() { b.termsIter.clear() } -func (b *builderFromSegments) SetFilter(filter *bloom.ReadOnlyBloomFilter) { +func (b *builderFromSegments) SetFilter(filter *bloom.ReadOnlyBloomFilter, filterCount tally.Counter) { b.filter = filter + b.filterCount = filterCount } func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { @@ -123,6 +127,9 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { if b.filter != nil && !b.filter.Test(d.ID) { // Actively filtering and ID is not contained. skip = append(skip, iter.PostingsID()) + if b.filterCount != nil { + b.filterCount.Inc(1) + } continue } b.idSet.SetUnsafe(d.ID, struct{}{}, IDsMapSetUnsafeOptions{ diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go index eb71747a7e..0099e27d88 100644 --- a/src/m3ninx/index/segment/types.go +++ b/src/m3ninx/index/segment/types.go @@ -22,11 +22,13 @@ package segment import ( "errors" - "github.com/m3db/bloom/v4" "github.com/m3db/m3/src/m3ninx/doc" "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" + + "github.com/m3db/bloom/v4" + "github.com/uber-go/tally" ) var ( @@ -223,7 +225,7 @@ type SegmentsBuilder interface { // SetFilter sets a filter on which documents to retain // when building the segment. - SetFilter(bloom *bloom.ReadOnlyBloomFilter) + SetFilter(bloom *bloom.ReadOnlyBloomFilter, filterCount tally.Counter) // AddSegments adds segments to build from. 
AddSegments(segments []Segment) error From 416dbfccbafbaa460cc662b7423aa5564c79cf66 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 20 Jan 2021 17:18:27 -0500 Subject: [PATCH 057/106] Fix build --- src/dbnode/storage/index/mutable_segments.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index f7c40cd3df..16be825957 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -586,8 +586,8 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { for _, seg := range m.backgroundSegments { alreadyHasTask := false for _, task := range plan.Tasks { - for _, taskSegment := range task.Segments() { - if taskSegment == seg.Segment() { + for _, taskSegment := range task.Segments { + if taskSegment.Segment == seg.Segment() { alreadyHasTask = true break } From 71d54ca01ca5bcbc97be1b7ef43afba250bdb645 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 20 Jan 2021 18:34:01 -0500 Subject: [PATCH 058/106] Always query the in-memory block --- src/dbnode/storage/index.go | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index a9765fd89f..ee364413ee 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -542,6 +542,10 @@ func (i *nsIndex) reportStats() error { return err } } + // In memory block should always be open. + if err := i.inMemoryBlock.Stats(reporter); err != nil { + return err + } // Update level stats. for _, elem := range []struct { @@ -1604,7 +1608,7 @@ func (i *nsIndex) queryWithSpan( }) // NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's // immutable and we only create an iterator over it. - iter := newBlocksIterStackAlloc(i.state.blocksDescOrderImmutable, qryRange) + iter := newBlocksIterStackAlloc(i.inMemoryBlock, i.state.blocksDescOrderImmutable, qryRange) // Can now release the lock and execute the query without holding the lock. i.state.RUnlock() @@ -1993,11 +1997,6 @@ func (i *nsIndex) updateBlockStartsWithLock() { }) } - blocks = append(blocks, blockAndBlockStart{ - block: i.inMemoryBlock, - blockStart: xtime.ToUnixNano(i.inMemoryBlock.StartTime()), - }) - // order in desc order (i.e. reverse chronological) sort.Slice(blocks, func(i, j int) bool { return blocks[i].blockStart > blocks[j].blockStart @@ -2188,11 +2187,13 @@ func (i *nsIndex) Close() error { var multiErr xerrors.MultiError multiErr = multiErr.Add(i.state.insertQueue.Stop()) - blocks := make([]index.Block, 0, len(i.state.blocksByTime)) + blocks := make([]index.Block, 0, len(i.state.blocksByTime)+1) for _, block := range i.state.blocksByTime { blocks = append(blocks, block) } + blocks = append(blocks, i.inMemoryBlock) + i.inMemoryBlock = nil i.state.latestBlock = nil i.state.blocksByTime = nil i.state.blocksDescOrderImmutable = nil @@ -2485,19 +2486,22 @@ func (shards dbShards) IDs() []uint32 { // blocksIterStackAlloc is a stack allocated block iterator, ensuring no // allocations per query. 
type blocksIterStackAlloc struct { + activeBlock index.Block blocks []blockAndBlockStart queryRanges xtime.Ranges idx int } func newBlocksIterStackAlloc( + activeBlock index.Block, blocks []blockAndBlockStart, queryRanges xtime.Ranges, ) blocksIterStackAlloc { return blocksIterStackAlloc{ + activeBlock: activeBlock, blocks: blocks, queryRanges: queryRanges, - idx: -1, + idx: -2, } } @@ -2509,6 +2513,11 @@ func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { for { iter.idx++ + if iter.idx == -1 { + // This will return the active block. + return iter, true + } + if iter.idx >= len(i.blocks) { return iter, false } @@ -2532,5 +2541,8 @@ func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { } func (i blocksIterStackAlloc) Current() index.Block { + if i.idx == -1 { + return i.activeBlock + } return i.blocks[i.idx].block } From 95ca25f2a5c55805b6b51107c530191748135dd1 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 22 Jan 2021 00:56:59 -0500 Subject: [PATCH 059/106] Only run a compaction on a background segment if active block starts change --- src/dbnode/storage/index/mutable_segments.go | 123 ++++++++++--------- src/dbnode/storage/index/segments.go | 21 ++-- 2 files changed, 80 insertions(+), 64 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 16be825957..a04a5dee8d 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -38,7 +38,6 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment/builder" "github.com/m3db/m3/src/m3ninx/index/segment/fst" "github.com/m3db/m3/src/x/context" - xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" xresource "github.com/m3db/m3/src/x/resource" @@ -315,6 +314,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, m.Unlock() // Updsate indexedBloomFilterByTime if needed. + var activeBlockStarts []xtime.UnixNano if m.blockOpts.InMemoryBlock { // Take references to the pending entries docs // and make sure not to touch sort order until later. @@ -369,8 +369,11 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, } bloomFilter.Write(docs[i].ID) } - // Update bloom filter snapshots if required. - for _, bloomFilter := range m.indexedBloomFilterByTime { + // Update bloom filter snapshots if required and also + // track the active block starts. + activeBlockStarts = make([]xtime.UnixNano, 0, len(m.indexedBloomFilterByTime)) + for blockStart, bloomFilter := range m.indexedBloomFilterByTime { + activeBlockStarts = append(activeBlockStarts, blockStart) bloomFilter.UpdateSnapshotIfRequired() } m.indexedBloomFilterByTimeLock.Unlock() @@ -396,7 +399,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, // We inserted some documents, need to compact immediately into a // foreground segment from the segment builder before we can serve reads // from an FST segment. 
- result, err := m.foregroundCompactWithBuilder(builder) + result, err := m.foregroundCompactWithBuilder(builder, activeBlockStarts) if err != nil { return MutableSegmentsStats{}, err } @@ -564,10 +567,12 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { return } - var indexedAndActiveBloomFilter *bloom.ReadOnlyBloomFilter + var ( + activeBlockStarts []xtime.UnixNano + activeBloomFilter *bloom.ReadOnlyBloomFilter + ) if m.blockOpts.InMemoryBlock { // Check if any segments needs filtering. - // Prepare the bloom filter to merge into from the live time windows. if m.backgroundCompactIndexedSnapshot == nil { m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() @@ -576,13 +581,15 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Merge with existing live time windows. m.indexedBloomFilterByTimeLock.RLock() - for _, bloomFilter := range m.indexedBloomFilterByTime { + activeBlockStarts = make([]xtime.UnixNano, 0, len(m.indexedBloomFilterByTime)) + for blockStart, bloomFilter := range m.indexedBloomFilterByTime { + activeBlockStarts = append(activeBlockStarts, blockStart) bloomFilter.MergeSnapshot(m.backgroundCompactIndexedSnapshot) } m.indexedBloomFilterByTimeLock.RUnlock() // Now check which segments need filtering. - indexedAndActiveBloomFilter = m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() + activeBlockStartsAnyOutdated := false for _, seg := range m.backgroundSegments { alreadyHasTask := false for _, task := range plan.Tasks { @@ -598,52 +605,49 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { continue } - reader, err := seg.Segment().Reader() - if err != nil { - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("index background compaction plan reader error", zap.Error(err)) - }) - return + activeBlockStartsOutdated := false + if len(seg.containedBlockStarts) != len(activeBlockStarts) { + activeBlockStartsOutdated = true + } else { + for _, blockStart := range seg.containedBlockStarts { + found := false + for _, activeBlockStart := range activeBlockStarts { + if activeBlockStart == blockStart { + found = true + break + } + } + if !found { + activeBlockStartsOutdated = true + break + } + } } - iter, err := reader.AllDocs() - if err != nil { - _ = reader.Close() - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("index background compaction plan iter start error", zap.Error(err)) - }) - return + if !activeBlockStartsOutdated { + continue } - for iter.Next() { - d := iter.Current() - if !indexedAndActiveBloomFilter.Test(d.ID) { - // This series is not active, likely part of a block - // time window that is now sealed. - // We need to purge it to remove memory. - plan.Tasks = append(plan.Tasks, compaction.Task{ - Segments: []compaction.Segment{ - { - Age: seg.Age(), - Size: seg.Segment().Size(), - Type: segments.FSTType, - Segment: seg.Segment(), - }, - }, - }) - m.metrics.activeBlockGarbageCollectSegment.Inc(1) - break - } - } - if err := xerrors.FirstError(iter.Err(), iter.Close()); err != nil { - _ = reader.Close() - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("index background compaction plan iter done error", zap.Error(err)) - }) - return - } + // The active block starts are outdated, need to compact + // and remove any old data from the segment. 
+ activeBlockStartsAnyOutdated = true + plan.Tasks = append(plan.Tasks, compaction.Task{ + Segments: []compaction.Segment{ + { + Age: seg.Age(), + Size: seg.Segment().Size(), + Type: segments.FSTType, + Segment: seg.Segment(), + }, + }, + }) + } - _ = reader.Close() + if activeBlockStartsAnyOutdated { + // Only set the bloom filter to actively filter series out + // if there were any segments that need the active block starts + // updated. + activeBloomFilter = m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() } } @@ -654,7 +658,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Kick off compaction. m.compact.compactingBackground = true go func() { - m.backgroundCompactWithPlan(plan, indexedAndActiveBloomFilter) + m.backgroundCompactWithPlan(plan, activeBlockStarts, activeBloomFilter) m.Lock() m.compact.compactingBackground = false @@ -710,6 +714,7 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg func (m *mutableSegments) backgroundCompactWithPlan( plan *compaction.Plan, + activeBlockStarts []xtime.UnixNano, activeBloomFilter *bloom.ReadOnlyBloomFilter, ) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() @@ -737,8 +742,8 @@ func (m *mutableSegments) backgroundCompactWithPlan( } for i, task := range plan.Tasks { - err := m.backgroundCompactWithTask(task, activeBloomFilter, log, - logger.With(zap.Int("task", i))) + err := m.backgroundCompactWithTask(task, activeBlockStarts, + activeBloomFilter, log, logger.With(zap.Int("task", i))) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { l.Error("error compacting segments", zap.Error(err)) @@ -761,6 +766,7 @@ func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, + activeBlockStarts []xtime.UnixNano, activeBloomFilter *bloom.ReadOnlyBloomFilter, log bool, logger *zap.Logger, @@ -815,7 +821,7 @@ func (m *mutableSegments) backgroundCompactWithTask( } result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, replaceSegment) + segments, replaceSegment, activeBlockStarts) m.backgroundSegments = result return nil @@ -825,6 +831,7 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( current []*readableSeg, segmentsJustCompacted []segment.Segment, compacted segment.Segment, + activeBlockStarts []xtime.UnixNano, ) []*readableSeg { result := make([]*readableSeg, 0, len(current)) for _, existing := range current { @@ -857,11 +864,12 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( } // Return all the ones we kept plus the new compacted segment - return append(result, newReadableSeg(compacted, m.opts)) + return append(result, newReadableSeg(compacted, activeBlockStarts, m.opts)) } func (m *mutableSegments) foregroundCompactWithBuilder( builder segment.DocumentsBuilder, + activeBlockStarts []xtime.UnixNano, ) (MutableSegmentsStats, error) { // We inserted some documents, need to compact immediately into a // foreground segment. @@ -933,7 +941,7 @@ func (m *mutableSegments) foregroundCompactWithBuilder( // Run the first task, without resetting the builder. result, err := m.foregroundCompactWithTask(builder, plan.Tasks[0], - log, logger.With(zap.Int("task", 0))) + activeBlockStarts, log, logger.With(zap.Int("task", 0))) if err != nil { return result, err } @@ -950,7 +958,7 @@ func (m *mutableSegments) foregroundCompactWithBuilder( // Now use the builder after resetting it. 
builder.Reset() result, err = m.foregroundCompactWithTask(builder, task, - log, logger.With(zap.Int("task", i))) + activeBlockStarts, log, logger.With(zap.Int("task", i))) if err != nil { return result, err } @@ -1003,6 +1011,7 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( func (m *mutableSegments) foregroundCompactWithTask( builder segment.DocumentsBuilder, task compaction.Task, + activeBlockStarts []xtime.UnixNano, log bool, logger *zap.Logger, ) (MutableSegmentsStats, error) { @@ -1046,7 +1055,7 @@ func (m *mutableSegments) foregroundCompactWithTask( defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.foregroundSegments, - segments, segment) + segments, segment, activeBlockStarts) m.foregroundSegments = result foregroundNumSegments, foregroundNumDocs := numSegmentsAndDocs(m.foregroundSegments) backgroundNumSegments, backgroundNumDocs := numSegmentsAndDocs(m.backgroundSegments) diff --git a/src/dbnode/storage/index/segments.go b/src/dbnode/storage/index/segments.go index ce3d8ae5b3..2a7f5a44c0 100644 --- a/src/dbnode/storage/index/segments.go +++ b/src/dbnode/storage/index/segments.go @@ -25,20 +25,27 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/x/clock" + xtime "github.com/m3db/m3/src/x/time" ) type readableSeg struct { - nowFn clock.NowFn - createdAt time.Time - segment segment.Segment + nowFn clock.NowFn + createdAt time.Time + segment segment.Segment + containedBlockStarts []xtime.UnixNano } -func newReadableSeg(seg segment.Segment, opts Options) *readableSeg { +func newReadableSeg( + seg segment.Segment, + containedBlockStarts []xtime.UnixNano, + opts Options, +) *readableSeg { nowFn := opts.ClockOptions().NowFn() return &readableSeg{ - nowFn: nowFn, - createdAt: nowFn(), - segment: seg, + nowFn: nowFn, + createdAt: nowFn(), + segment: seg, + containedBlockStarts: containedBlockStarts, } } From 9fd2d4e0451d95a0817a9d266566ddaf34fb593f Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 22 Jan 2021 18:18:23 -0500 Subject: [PATCH 060/106] Don't require acquiring other locks from maybeBackgroundCompactWithLock --- src/dbnode/storage/index/mutable_segments.go | 84 +++++++++----------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index a04a5dee8d..34b7bffcce 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -68,9 +68,11 @@ type mutableSegments struct { state mutableSegmentsState - foregroundSegments []*readableSeg - backgroundSegments []*readableSeg - backgroundCompactIndexedSnapshot *indexedBloomFilterSnapshot + foregroundSegments []*readableSeg + backgroundSegments []*readableSeg + indexedSnapshot *indexedBloomFilterSnapshot + backgroundCompactActiveBlockStarts []xtime.UnixNano + backgroundCompactIndexedSnapshot *indexedBloomFilterSnapshot compact mutableSegmentsCompact blockStart time.Time @@ -311,6 +313,10 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, m.compact.compactingForeground = true builder := m.compact.segmentBuilder + if m.indexedSnapshot == nil { + m.indexedSnapshot = newIndexedBloomFilterSnapshot() + } + m.indexedSnapshot.Reset() m.Unlock() // Updsate indexedBloomFilterByTime if needed. 
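The restructure in this patch follows a single pattern: take the snapshot copy on the write path, where the relevant lock is already held, so the background compaction planner can later read the copy without acquiring any other locks. A minimal standard-library-only sketch of that idea; the type and field names here are illustrative, not the real mutableSegments fields:

package main

import (
	"fmt"
	"sync"
)

type state struct {
	mu   sync.Mutex
	live []byte // mutated by writers while holding mu
}

// snapshot copies live into dst under the lock and returns the copy.
// Callers then read the returned slice lock-free while writers keep
// mutating live.
func (s *state) snapshot(dst []byte) []byte {
	s.mu.Lock()
	defer s.mu.Unlock()
	// append(dst[:0], src...) reuses dst's backing array once it has
	// grown large enough, so steady-state snapshots allocate nothing.
	return append(dst[:0], s.live...)
}

func main() {
	s := &state{live: []byte("abc")}
	var buf []byte
	buf = s.snapshot(buf) // allocates
	buf = s.snapshot(buf) // reuses buf's backing array
	fmt.Println(string(buf))
}

The append(dst[:0], src...) form is the same buffer-reuse idiom as the backgroundCompactIndexedSnapshot.buffer copy in the hunk that follows.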
@@ -375,12 +381,22 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, for blockStart, bloomFilter := range m.indexedBloomFilterByTime { activeBlockStarts = append(activeBlockStarts, blockStart) bloomFilter.UpdateSnapshotIfRequired() + bloomFilter.MergeSnapshot(m.indexedSnapshot) } m.indexedBloomFilterByTimeLock.Unlock() } defer func() { m.Lock() + // Check if any segments needs filtering. + // Prepare the bloom filter to merge into from the live time windows. + m.backgroundCompactActiveBlockStarts = activeBlockStarts + if m.backgroundCompactIndexedSnapshot == nil { + m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() + } + m.backgroundCompactIndexedSnapshot.buffer = + append(m.backgroundCompactIndexedSnapshot.buffer[:0], m.indexedSnapshot.buffer...) + m.compact.compactingForeground = false m.cleanupForegroundCompactWithLock() m.Unlock() @@ -571,25 +587,14 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { activeBlockStarts []xtime.UnixNano activeBloomFilter *bloom.ReadOnlyBloomFilter ) - if m.blockOpts.InMemoryBlock { - // Check if any segments needs filtering. - // Prepare the bloom filter to merge into from the live time windows. - if m.backgroundCompactIndexedSnapshot == nil { - m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() - } - m.backgroundCompactIndexedSnapshot.Reset() - - // Merge with existing live time windows. - m.indexedBloomFilterByTimeLock.RLock() - activeBlockStarts = make([]xtime.UnixNano, 0, len(m.indexedBloomFilterByTime)) - for blockStart, bloomFilter := range m.indexedBloomFilterByTime { - activeBlockStarts = append(activeBlockStarts, blockStart) - bloomFilter.MergeSnapshot(m.backgroundCompactIndexedSnapshot) - } - m.indexedBloomFilterByTimeLock.RUnlock() + if m.blockOpts.InMemoryBlock && m.backgroundCompactIndexedSnapshot != nil { + // Only set the bloom filter to actively filter series out + // if there were any segments that need the active block starts + // updated. + activeBlockStarts = m.backgroundCompactActiveBlockStarts + activeBloomFilter = m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() // Now check which segments need filtering. - activeBlockStartsAnyOutdated := false for _, seg := range m.backgroundSegments { alreadyHasTask := false for _, task := range plan.Tasks { @@ -606,22 +611,19 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { } activeBlockStartsOutdated := false - if len(seg.containedBlockStarts) != len(activeBlockStarts) { - activeBlockStartsOutdated = true - } else { - for _, blockStart := range seg.containedBlockStarts { - found := false - for _, activeBlockStart := range activeBlockStarts { - if activeBlockStart == blockStart { - found = true - break - } - } - if !found { - activeBlockStartsOutdated = true + for _, blockStart := range seg.containedBlockStarts { + found := false + for _, activeBlockStart := range activeBlockStarts { + if activeBlockStart == blockStart { + found = true break } } + if !found { + // Contains an active block start that should be removed. + activeBlockStartsOutdated = true + break + } } if !activeBlockStartsOutdated { @@ -630,7 +632,6 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // The active block starts are outdated, need to compact // and remove any old data from the segment. 
- activeBlockStartsAnyOutdated = true plan.Tasks = append(plan.Tasks, compaction.Task{ Segments: []compaction.Segment{ { @@ -642,13 +643,6 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }, }) } - - if activeBlockStartsAnyOutdated { - // Only set the bloom filter to actively filter series out - // if there were any segments that need the active block starts - // updated. - activeBloomFilter = m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() - } } if len(plan.Tasks) == 0 { @@ -808,10 +802,6 @@ func (m *mutableSegments) backgroundCompactWithTask( return err } - // Rotate out the replaced frozen segments and add the compacted one. - m.Lock() - defer m.Unlock() - var replaceSegment segment.Segment if !empty { // Add a read through cache for repeated expensive queries against @@ -820,6 +810,10 @@ func (m *mutableSegments) backgroundCompactWithTask( replaceSegment = m.newReadThroughSegment(compacted) } + // Rotate out the replaced frozen segments and add the compacted one. + m.Lock() + defer m.Unlock() + result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, segments, replaceSegment, activeBlockStarts) m.backgroundSegments = result From e98f77e00ccb7c9fdd1fda51fd0a78e5e7bd45aa Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 22 Jan 2021 20:06:38 -0500 Subject: [PATCH 061/106] Add parallel background compaction --- src/dbnode/storage/index/mutable_segments.go | 27 ++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 34b7bffcce..b66f8d4947 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -41,6 +41,7 @@ import ( "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" xresource "github.com/m3db/m3/src/x/resource" + xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" "github.com/uber-go/tally" @@ -73,6 +74,7 @@ type mutableSegments struct { indexedSnapshot *indexedBloomFilterSnapshot backgroundCompactActiveBlockStarts []xtime.UnixNano backgroundCompactIndexedSnapshot *indexedBloomFilterSnapshot + backgroundCompactWorkers xsync.WorkerPool compact mutableSegmentsCompact blockStart time.Time @@ -230,6 +232,8 @@ func newMutableSegments( namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, iopts instrument.Options, ) *mutableSegments { + backgroundCompactWorkers := xsync.NewWorkerPool(4) + backgroundCompactWorkers.Init() m := &mutableSegments{ blockStart: blockStart, blockSize: md.Options().IndexOptions().BlockSize(), @@ -237,6 +241,7 @@ func newMutableSegments( blockOpts: blockOpts, iopts: iopts, indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), + backgroundCompactWorkers: backgroundCompactWorkers, metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), logger: iopts.Logger(), } @@ -735,16 +740,22 @@ func (m *mutableSegments) backgroundCompactWithPlan( } } + var wg sync.WaitGroup for i, task := range plan.Tasks { - err := m.backgroundCompactWithTask(task, activeBlockStarts, - activeBloomFilter, log, logger.With(zap.Int("task", i))) - if err != nil { - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("error compacting segments", zap.Error(err)) - }) - return - } + i, task := i, task + wg.Add(1) + m.backgroundCompactWorkers.Go(func() { + err := m.backgroundCompactWithTask(task, activeBlockStarts, + activeBloomFilter, log, logger.With(zap.Int("task", i))) + if err != nil { 
+				instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+					l.Error("error compacting segments", zap.Error(err))
+				})
+			}
+		})
 	}
+
+	wg.Wait()
 }
 
 func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment {

From 9857326b3ade20e3342af02913144c831c815efd Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Fri, 22 Jan 2021 20:18:05 -0500
Subject: [PATCH 062/106] Add wg.Done()

---
 src/dbnode/storage/index/mutable_segments.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go
index b66f8d4947..af7b4a0209 100644
--- a/src/dbnode/storage/index/mutable_segments.go
+++ b/src/dbnode/storage/index/mutable_segments.go
@@ -745,6 +745,7 @@ func (m *mutableSegments) backgroundCompactWithPlan(
 		i, task := i, task
 		wg.Add(1)
 		m.backgroundCompactWorkers.Go(func() {
+			defer wg.Done()
 			err := m.backgroundCompactWithTask(task, activeBlockStarts,
 				activeBloomFilter, log, logger.With(zap.Int("task", i)))
 			if err != nil {

From 23e5ff29ceda78ef4a124aed50eef22069f5d432 Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Sat, 23 Jan 2021 00:00:01 -0500
Subject: [PATCH 063/106] Allocate number of background compactors

---
 src/dbnode/storage/index/mutable_segments.go | 68 ++++++++++++--------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go
index af7b4a0209..47e81967f0 100644
--- a/src/dbnode/storage/index/mutable_segments.go
+++ b/src/dbnode/storage/index/mutable_segments.go
@@ -41,7 +41,6 @@ import (
 	"github.com/m3db/m3/src/x/instrument"
 	"github.com/m3db/m3/src/x/mmap"
 	xresource "github.com/m3db/m3/src/x/resource"
-	xsync "github.com/m3db/m3/src/x/sync"
 	xtime "github.com/m3db/m3/src/x/time"
 
 	"github.com/uber-go/tally"
@@ -54,6 +53,8 @@ var (
 	errForegroundCompactorNoPlan               = errors.New("index foreground compactor failed to generate a plan")
 	errForegroundCompactorBadPlanFirstTask     = errors.New("index foreground compactor generated plan without mutable segment in first task")
 	errForegroundCompactorBadPlanSecondaryTask = errors.New("index foreground compactor generated plan with mutable segment a secondary task")
+
+	numBackgroundCompactors = int(math.Min(4, float64(runtime.NumCPU())/2))
 )
 
 type mutableSegmentsState uint
@@ -74,7 +75,6 @@ type mutableSegments struct {
 	indexedSnapshot                    *indexedBloomFilterSnapshot
 	backgroundCompactActiveBlockStarts []xtime.UnixNano
 	backgroundCompactIndexedSnapshot   *indexedBloomFilterSnapshot
-	backgroundCompactWorkers           xsync.WorkerPool
 
 	compact                  mutableSegmentsCompact
 	blockStart               time.Time
@@ -232,8 +232,6 @@ func newMutableSegments(
 	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
 	iopts instrument.Options,
 ) *mutableSegments {
-	backgroundCompactWorkers := xsync.NewWorkerPool(4)
-	backgroundCompactWorkers.Init()
 	m := &mutableSegments{
 		blockStart:               blockStart,
 		blockSize:                md.Options().IndexOptions().BlockSize(),
 		opts:                     opts,
 		blockOpts:                blockOpts,
 		iopts:                    iopts,
 		indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter),
-		backgroundCompactWorkers: backgroundCompactWorkers,
 		metrics:                  newMutableSegmentsMetrics(iopts.MetricsScope()),
 		logger:                   iopts.Logger(),
 	}
@@ -688,16 +685,22 @@ func (m *mutableSegments) cleanupBackgroundCompactWithLock() {
 	m.backgroundSegments = nil
 
 	// Free compactor resources. 
- if m.compact.backgroundCompactor == nil { + if m.compact.backgroundCompactors == nil { return } - if err := m.compact.backgroundCompactor.Close(); err != nil { - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("error closing index block background compactor", zap.Error(err)) - }) + backgroundCompactors := m.compact.backgroundCompactors + close(backgroundCompactors) + + m.compact.backgroundCompactors = nil + + for compactor := range backgroundCompactors { + if err := compactor.Close(); err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error closing index block background compactor", zap.Error(err)) + }) + } } - m.compact.backgroundCompactor = nil } func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg) { @@ -744,16 +747,20 @@ func (m *mutableSegments) backgroundCompactWithPlan( for i, task := range plan.Tasks { i, task := i, task wg.Add(1) - m.backgroundCompactWorkers.Go(func() { - defer wg.Done() + compactor := <-m.compact.backgroundCompactors + go func() { + defer func() { + m.compact.backgroundCompactors <- compactor + wg.Done() + }() err := m.backgroundCompactWithTask(task, activeBlockStarts, - activeBloomFilter, log, logger.With(zap.Int("task", i))) + activeBloomFilter, compactor, log, logger.With(zap.Int("task", i))) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { l.Error("error compacting segments", zap.Error(err)) }) } - }) + }() } wg.Wait() @@ -774,6 +781,7 @@ func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, activeBlockStarts []xtime.UnixNano, activeBloomFilter *bloom.ReadOnlyBloomFilter, + compactor *compaction.Compactor, log bool, logger *zap.Logger, ) error { @@ -787,7 +795,7 @@ func (m *mutableSegments) backgroundCompactWithTask( } start := time.Now() - compacted, err := m.compact.backgroundCompactor.Compact(segments, + compacted, err := compactor.Compact(segments, activeBloomFilter, m.metrics.activeBlockGarbageCollectSeries, mmap.ReporterOptions{ @@ -979,7 +987,7 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( if len(segments) == 0 { return } - if m.compact.backgroundCompactor == nil { + if m.compact.backgroundCompactors == nil { // No longer performing background compaction due to evict/close. 
return } @@ -1124,7 +1132,7 @@ func (m *mutableSegments) cleanupCompactWithLock() { type mutableSegmentsCompact struct { segmentBuilder segment.CloseableDocumentsBuilder foregroundCompactor *compaction.Compactor - backgroundCompactor *compaction.Compactor + backgroundCompactors chan *compaction.Compactor compactingForeground bool compactingBackground bool numForeground int @@ -1169,16 +1177,20 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( } } - if m.backgroundCompactor == nil { - m.backgroundCompactor, err = compaction.NewCompactor(docsPool, - DocumentArrayPoolCapacity, - opts.SegmentBuilderOptions(), - opts.FSTSegmentOptions(), - compaction.CompactorOptions{ - MmapDocsData: blockOpts.BackgroundCompactorMmapDocsData, - }) - if err != nil { - return err + if m.backgroundCompactors == nil { + m.backgroundCompactors = make(chan *compaction.Compactor, numBackgroundCompactors) + for i := 0; i < numBackgroundCompactors; i++ { + backgroundCompactor, err := compaction.NewCompactor(docsPool, + DocumentArrayPoolCapacity, + opts.SegmentBuilderOptions(), + opts.FSTSegmentOptions(), + compaction.CompactorOptions{ + MmapDocsData: blockOpts.BackgroundCompactorMmapDocsData, + }) + if err != nil { + return err + } + m.backgroundCompactors <- backgroundCompactor } } From 23f65482f3d4bdff3419e8dacf7d3a430c21aa76 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 23 Jan 2021 12:10:31 -0500 Subject: [PATCH 064/106] Use up to num cpu / 2 for background compaction threads --- src/dbnode/namespace/namespace_runtime_options.go | 2 +- src/dbnode/storage/index/mutable_segments.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dbnode/namespace/namespace_runtime_options.go b/src/dbnode/namespace/namespace_runtime_options.go index 26538c7a7a..323b1f45dd 100644 --- a/src/dbnode/namespace/namespace_runtime_options.go +++ b/src/dbnode/namespace/namespace_runtime_options.go @@ -28,7 +28,7 @@ import ( ) const ( - defaultWriteIndexingPerCPUConcurrency = 0.75 + defaultWriteIndexingPerCPUConcurrency = 1.0 defaultFlushIndexingPerCPUConcurrency = 0.25 ) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 47e81967f0..066aa90227 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -29,7 +29,6 @@ import ( "sync" "time" - "github.com/m3db/bloom/v4" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/dbnode/storage/index/compaction" "github.com/m3db/m3/src/dbnode/storage/index/segments" @@ -43,6 +42,7 @@ import ( xresource "github.com/m3db/m3/src/x/resource" xtime "github.com/m3db/m3/src/x/time" + "github.com/m3db/bloom/v4" "github.com/uber-go/tally" "go.uber.org/zap" ) @@ -54,7 +54,7 @@ var ( errForegroundCompactorBadPlanFirstTask = errors.New("index foreground compactor generated plan without mutable segment in first task") errForegroundCompactorBadPlanSecondaryTask = errors.New("index foreground compactor generated plan with mutable segment a secondary task") - numBackgroundCompactors = int(math.Min(4, float64(runtime.NumCPU())/2)) + numBackgroundCompactors = int(math.Max(1, float64(runtime.NumCPU())/2)) ) type mutableSegmentsState uint From 1fb1850fba51bb75f9984df8d5da9c1dcd6384ff Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 23 Jan 2021 12:30:28 -0500 Subject: [PATCH 065/106] Use num CPU minus one for background compactors --- src/dbnode/storage/index/mutable_segments.go | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 066aa90227..ded40aeaf3 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -54,7 +54,9 @@ var ( errForegroundCompactorBadPlanFirstTask = errors.New("index foreground compactor generated plan without mutable segment in first task") errForegroundCompactorBadPlanSecondaryTask = errors.New("index foreground compactor generated plan with mutable segment a secondary task") - numBackgroundCompactors = int(math.Max(1, float64(runtime.NumCPU())/2)) + // numBackgroundCompactors should use up to num CPU minus one + // to reserve for the foreground compactor. + numBackgroundCompactors = math.Max(1, float64(runtime.NumCPU())-1) ) type mutableSegmentsState uint From a3336257900846f7cfbce4203437ca97bd28e284 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 23 Jan 2021 12:31:34 -0500 Subject: [PATCH 066/106] Cast num CPU minus one to int --- src/dbnode/storage/index/mutable_segments.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index ded40aeaf3..f33f3e6c74 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -56,7 +56,7 @@ var ( // numBackgroundCompactors should use up to num CPU minus one // to reserve for the foreground compactor. - numBackgroundCompactors = math.Max(1, float64(runtime.NumCPU())-1) + numBackgroundCompactors = int(math.Max(1, float64(runtime.NumCPU())-1)) ) type mutableSegmentsState uint From ecbd20ef018f420b4ffaad7e4d25efccda921ba9 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 23 Jan 2021 17:38:02 -0500 Subject: [PATCH 067/106] Use default compaction level of 0-500k --- src/dbnode/storage/index/compaction/plan.go | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/dbnode/storage/index/compaction/plan.go b/src/dbnode/storage/index/compaction/plan.go index e236f62fc6..a2c7c85819 100644 --- a/src/dbnode/storage/index/compaction/plan.go +++ b/src/dbnode/storage/index/compaction/plan.go @@ -35,18 +35,10 @@ var ( var ( // DefaultLevels are the default Level(s) used for compaction options. - DefaultLevels = []Level{ // i.e. tiers for compaction [0, 262K), [262K, 1M), [1M, 4M) - Level{ + DefaultLevels = []Level{ // i.e. 
tiers for compaction [0, 524K) + { MinSizeInclusive: 0, - MaxSizeExclusive: 1 << 18, - }, - Level{ - MinSizeInclusive: 1 << 18, - MaxSizeExclusive: 1 << 20, - }, - Level{ - MinSizeInclusive: 1 << 20, - MaxSizeExclusive: 1 << 22, + MaxSizeExclusive: 1 << 19, }, } From fe9eab3aab221b486e135ebfd82049b54795b537 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 23 Jan 2021 17:43:40 -0500 Subject: [PATCH 068/106] Sort largest to smallest segments when using multi segments builder --- .../index/segment/builder/multi_segments_builder.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go index 525b6814e1..44defcad46 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_builder.go +++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go @@ -93,6 +93,15 @@ func (b *builderFromSegments) SetFilter(filter *bloom.ReadOnlyBloomFilter, filte } func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { + // Order by largest -> smallest so that the first segment + // is the largest when iterating over term postings lists + // (which means it can be directly copied into the merged postings + // list via a union rather than needing to shift posting list + // IDs to take into account for duplicates). + sort.Slice(segments, func(i, j int) bool { + return segments[i].Size() > segments[j].Size() + }) + // numMaxDocs can sometimes be larger than the actual number of documents // since some are duplicates numMaxDocs := 0 From d41284d9b61c922518f8d5c4b104ed1b14ab42cf Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 14:56:11 -0500 Subject: [PATCH 069/106] Use maps solely for determining existence for compaction purposes or not --- .../storage/index/compaction/compactor.go | 3 +- src/dbnode/storage/index/compaction/plan.go | 4 +- src/dbnode/storage/index/mutable_segments.go | 153 +++++++----------- .../segment/builder/multi_segments_builder.go | 7 +- src/m3ninx/index/segment/types.go | 8 +- 5 files changed, 67 insertions(+), 108 deletions(-) diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index 0f806ad8a8..5a481350ec 100644 --- a/src/dbnode/storage/index/compaction/compactor.go +++ b/src/dbnode/storage/index/compaction/compactor.go @@ -35,7 +35,6 @@ import ( xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/mmap" - "github.com/m3db/bloom/v4" "github.com/uber-go/tally" ) @@ -111,7 +110,7 @@ func NewCompactor( // time. func (c *Compactor) Compact( segs []segment.Segment, - filter *bloom.ReadOnlyBloomFilter, + filter segment.DocumentsFilter, filterCounter tally.Counter, reporterOptions mmap.ReporterOptions, ) (fst.Segment, error) { diff --git a/src/dbnode/storage/index/compaction/plan.go b/src/dbnode/storage/index/compaction/plan.go index a2c7c85819..f5c26f2921 100644 --- a/src/dbnode/storage/index/compaction/plan.go +++ b/src/dbnode/storage/index/compaction/plan.go @@ -35,10 +35,10 @@ var ( var ( // DefaultLevels are the default Level(s) used for compaction options. - DefaultLevels = []Level{ // i.e. tiers for compaction [0, 524K) + DefaultLevels = []Level{ // i.e. 
tiers for compaction [0, 300K) { MinSizeInclusive: 0, - MaxSizeExclusive: 1 << 19, + MaxSizeExclusive: 1 << 18, }, } diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index f33f3e6c74..6df29db45a 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -21,7 +21,6 @@ package index import ( - "bytes" "errors" "fmt" "math" @@ -42,7 +41,6 @@ import ( xresource "github.com/m3db/m3/src/x/resource" xtime "github.com/m3db/m3/src/x/time" - "github.com/m3db/bloom/v4" "github.com/uber-go/tally" "go.uber.org/zap" ) @@ -74,9 +72,9 @@ type mutableSegments struct { foregroundSegments []*readableSeg backgroundSegments []*readableSeg - indexedSnapshot *indexedBloomFilterSnapshot + indexedSnapshot *builder.IDsMap backgroundCompactActiveBlockStarts []xtime.UnixNano - backgroundCompactIndexedSnapshot *indexedBloomFilterSnapshot + backgroundCompactIndexedSnapshot *builder.IDsMap compact mutableSegmentsCompact blockStart time.Time @@ -95,28 +93,14 @@ type mutableSegments struct { } type indexedBloomFilter struct { - doNotWrite *builder.IDsMap - writes *bloom.BloomFilter - snapshotDirty bool - snapshot *bytes.Buffer + doNotWrite *builder.IDsMap } -var ( - // Estimate bloom values for 1million and 1% false positive rate. - // Roughly 1mb size with k:7 (hash 7 times on insert/lookup). - bloomM, bloomK = bloom.EstimateFalsePositiveRate(1<<20, 0.01) -) - func newIndexedBloomFilter() *indexedBloomFilter { - bf := bloom.NewBloomFilter(bloomM, bloomK) - snapshot := bytes.NewBuffer(nil) - _ = bf.BitSet().Write(snapshot) return &indexedBloomFilter{ doNotWrite: builder.NewIDsMap(builder.IDsMapOptions{ InitialSize: 4096, }), - writes: bf, - snapshot: snapshot, } } @@ -129,56 +113,6 @@ func (f *indexedBloomFilter) Write(id []byte) { NoCopyKey: true, NoFinalizeKey: true, }) - f.writes.Add(id) - f.snapshotDirty = true -} - -func (f *indexedBloomFilter) UpdateSnapshotIfRequired() { - if !f.snapshotDirty { - return - } - f.snapshot.Truncate(0) - _ = f.writes.BitSet().Write(f.snapshot) - f.snapshotDirty = false -} - -func (f *indexedBloomFilter) MergeSnapshot( - snap *indexedBloomFilterSnapshot, -) { - data := f.snapshot.Bytes() - size := len(data) - if cap(snap.buffer) < size { - // Grow buffer if required. - snap.buffer = make([]byte, size) - } else { - snap.buffer = snap.buffer[:size] - } - - for i := range snap.buffer { - snap.buffer[i] |= data[i] - } -} - -type indexedBloomFilterSnapshot struct { - buffer []byte - bloomFilter *bloom.ReadOnlyBloomFilter -} - -func newIndexedBloomFilterSnapshot() *indexedBloomFilterSnapshot { - return &indexedBloomFilterSnapshot{} -} - -func (s *indexedBloomFilterSnapshot) Reset() { - for i := range s.buffer { - s.buffer[i] = 0 - } -} - -func (s *indexedBloomFilterSnapshot) ReadOnlyBloomFilter() *bloom.ReadOnlyBloomFilter { - // In future would be good to update read only bloom filter instead - // of having to create a new one with the buffer (even though it's just - // a wrapper over the buffer.). - return bloom.NewReadOnlyBloomFilter(bloomM, bloomK, s.buffer) } type mutableSegmentsMetrics struct { @@ -255,7 +189,10 @@ func (m *mutableSegments) NotifySealedBlocks( return nil } + m.Lock() + m.indexedBloomFilterByTimeLock.Lock() + // Remove entire time windows. for _, blockStart := range sealed { _, exists := m.indexedBloomFilterByTime[blockStart] if !exists { @@ -264,9 +201,22 @@ func (m *mutableSegments) NotifySealedBlocks( // Remove indexed set if block now sealed. 
delete(m.indexedBloomFilterByTime, blockStart) } + // Remove any from the indexed snapshots. + for _, elem := range m.indexedSnapshot.Iter() { + id := elem.Key() + contained := false + for _, filter := range m.indexedBloomFilterByTime { + if filter.ContainsWithNoFalsePositive(id) { + contained = true + break + } + } + if !contained { + m.indexedSnapshot.Delete(id) + } + } m.indexedBloomFilterByTimeLock.Unlock() - m.Lock() m.maybeBackgroundCompactWithLock() m.Unlock() @@ -296,6 +246,11 @@ func (m *mutableSegments) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptio } func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, error) { + // Take references to the pending entries docs + // and make sure not to touch sort order until later. + entries := inserts.PendingEntries() + docs := inserts.PendingDocs() + m.Lock() if m.state == mutableSegmentsStateClosed { m.Unlock() @@ -316,21 +271,21 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, } m.compact.compactingForeground = true - builder := m.compact.segmentBuilder if m.indexedSnapshot == nil { - m.indexedSnapshot = newIndexedBloomFilterSnapshot() + m.indexedSnapshot = builder.NewIDsMap(builder.IDsMapOptions{}) + } + for i := range docs { + m.indexedSnapshot.SetUnsafe(docs[i].ID, struct{}{}, builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) } - m.indexedSnapshot.Reset() + builder := m.compact.segmentBuilder m.Unlock() // Updsate indexedBloomFilterByTime if needed. var activeBlockStarts []xtime.UnixNano if m.blockOpts.InMemoryBlock { - // Take references to the pending entries docs - // and make sure not to touch sort order until later. - entries := inserts.PendingEntries() - docs := inserts.PendingDocs() - m.indexedBloomFilterByTimeLock.Lock() // Remove for indexing anything already indexed and // also update the tracking of what things have been indexed @@ -382,25 +337,14 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, // Update bloom filter snapshots if required and also // track the active block starts. activeBlockStarts = make([]xtime.UnixNano, 0, len(m.indexedBloomFilterByTime)) - for blockStart, bloomFilter := range m.indexedBloomFilterByTime { + for blockStart := range m.indexedBloomFilterByTime { activeBlockStarts = append(activeBlockStarts, blockStart) - bloomFilter.UpdateSnapshotIfRequired() - bloomFilter.MergeSnapshot(m.indexedSnapshot) } m.indexedBloomFilterByTimeLock.Unlock() } defer func() { m.Lock() - // Check if any segments needs filtering. - // Prepare the bloom filter to merge into from the live time windows. - m.backgroundCompactActiveBlockStarts = activeBlockStarts - if m.backgroundCompactIndexedSnapshot == nil { - m.backgroundCompactIndexedSnapshot = newIndexedBloomFilterSnapshot() - } - m.backgroundCompactIndexedSnapshot.buffer = - append(m.backgroundCompactIndexedSnapshot.buffer[:0], m.indexedSnapshot.buffer...) - m.compact.compactingForeground = false m.cleanupForegroundCompactWithLock() m.Unlock() @@ -589,14 +533,27 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { var ( activeBlockStarts []xtime.UnixNano - activeBloomFilter *bloom.ReadOnlyBloomFilter + activeFilter *builder.IDsMap ) - if m.blockOpts.InMemoryBlock && m.backgroundCompactIndexedSnapshot != nil { + if m.blockOpts.InMemoryBlock && m.indexedSnapshot != nil { // Only set the bloom filter to actively filter series out // if there were any segments that need the active block starts // updated. 
activeBlockStarts = m.backgroundCompactActiveBlockStarts - activeBloomFilter = m.backgroundCompactIndexedSnapshot.ReadOnlyBloomFilter() + activeFilter = m.backgroundCompactIndexedSnapshot + if activeFilter == nil { + activeFilter = builder.NewIDsMap(builder.IDsMapOptions{}) + m.backgroundCompactIndexedSnapshot = activeFilter + } + // Copy the indexed snapshot map so can use it downstream safely + // without holding a lock. + activeFilter.Reset() + for _, elem := range m.indexedSnapshot.Iter() { + activeFilter.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) + } // Now check which segments need filtering. for _, seg := range m.backgroundSegments { @@ -656,7 +613,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Kick off compaction. m.compact.compactingBackground = true go func() { - m.backgroundCompactWithPlan(plan, activeBlockStarts, activeBloomFilter) + m.backgroundCompactWithPlan(plan, activeBlockStarts, activeFilter) m.Lock() m.compact.compactingBackground = false @@ -719,7 +676,7 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg func (m *mutableSegments) backgroundCompactWithPlan( plan *compaction.Plan, activeBlockStarts []xtime.UnixNano, - activeBloomFilter *bloom.ReadOnlyBloomFilter, + activeFilter *builder.IDsMap, ) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() defer sw.Stop() @@ -756,7 +713,7 @@ func (m *mutableSegments) backgroundCompactWithPlan( wg.Done() }() err := m.backgroundCompactWithTask(task, activeBlockStarts, - activeBloomFilter, compactor, log, logger.With(zap.Int("task", i))) + activeFilter, compactor, log, logger.With(zap.Int("task", i))) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { l.Error("error compacting segments", zap.Error(err)) @@ -782,7 +739,7 @@ func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, activeBlockStarts []xtime.UnixNano, - activeBloomFilter *bloom.ReadOnlyBloomFilter, + activeFilter *builder.IDsMap, compactor *compaction.Compactor, log bool, logger *zap.Logger, @@ -798,7 +755,7 @@ func (m *mutableSegments) backgroundCompactWithTask( start := time.Now() compacted, err := compactor.Compact(segments, - activeBloomFilter, + activeFilter, m.metrics.activeBlockGarbageCollectSeries, mmap.ReporterOptions{ Context: mmap.Context{ diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go index 44defcad46..545ebf63ee 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_builder.go +++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go @@ -30,14 +30,13 @@ import ( "github.com/m3db/m3/src/m3ninx/postings" xerrors "github.com/m3db/m3/src/x/errors" - "github.com/m3db/bloom/v4" "github.com/uber-go/tally" ) type builderFromSegments struct { docs []doc.Document idSet *IDsMap - filter *bloom.ReadOnlyBloomFilter + filter segment.DocumentsFilter filterCount tally.Counter segments []segmentMetadata termsIter *termsIterFromSegments @@ -87,7 +86,7 @@ func (b *builderFromSegments) Reset() { b.termsIter.clear() } -func (b *builderFromSegments) SetFilter(filter *bloom.ReadOnlyBloomFilter, filterCount tally.Counter) { +func (b *builderFromSegments) SetFilter(filter segment.DocumentsFilter, filterCount tally.Counter) { b.filter = filter b.filterCount = filterCount } @@ -133,7 +132,7 @@ func (b *builderFromSegments) 
AddSegments(segments []segment.Segment) error { skip = append(skip, iter.PostingsID()) continue } - if b.filter != nil && !b.filter.Test(d.ID) { + if b.filter != nil && !b.filter.Contains(d.ID) { // Actively filtering and ID is not contained. skip = append(skip, iter.PostingsID()) if b.filterCount != nil { diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go index 0099e27d88..dd911d5075 100644 --- a/src/m3ninx/index/segment/types.go +++ b/src/m3ninx/index/segment/types.go @@ -27,7 +27,6 @@ import ( "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" - "github.com/m3db/bloom/v4" "github.com/uber-go/tally" ) @@ -225,8 +224,13 @@ type SegmentsBuilder interface { // SetFilter sets a filter on which documents to retain // when building the segment. - SetFilter(bloom *bloom.ReadOnlyBloomFilter, filterCount tally.Counter) + SetFilter(keep DocumentsFilter, filterCount tally.Counter) // AddSegments adds segments to build from. AddSegments(segments []Segment) error } + +// DocumentsFilter is a documents filter. +type DocumentsFilter interface { + Contains(id []byte) bool +} From e969766d805b3db540f81ca2291b8ca42d8f5285 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 15:06:20 -0500 Subject: [PATCH 070/106] Use indexedBloomFilterByTimeLock lock --- src/dbnode/storage/index/compaction/plan.go | 2 +- src/dbnode/storage/index/mutable_segments.go | 163 ++++++++++--------- 2 files changed, 87 insertions(+), 78 deletions(-) diff --git a/src/dbnode/storage/index/compaction/plan.go b/src/dbnode/storage/index/compaction/plan.go index f5c26f2921..4334e2b67e 100644 --- a/src/dbnode/storage/index/compaction/plan.go +++ b/src/dbnode/storage/index/compaction/plan.go @@ -35,7 +35,7 @@ var ( var ( // DefaultLevels are the default Level(s) used for compaction options. - DefaultLevels = []Level{ // i.e. tiers for compaction [0, 300K) + DefaultLevels = []Level{ // i.e. tiers for compaction [0, 262K) { MinSizeInclusive: 0, MaxSizeExclusive: 1 << 18, diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 6df29db45a..911b78c7ed 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -189,8 +189,6 @@ func (m *mutableSegments) NotifySealedBlocks( return nil } - m.Lock() - m.indexedBloomFilterByTimeLock.Lock() // Remove entire time windows. for _, blockStart := range sealed { @@ -217,6 +215,7 @@ func (m *mutableSegments) NotifySealedBlocks( } m.indexedBloomFilterByTimeLock.Unlock() + m.Lock() m.maybeBackgroundCompactWithLock() m.Unlock() @@ -246,11 +245,6 @@ func (m *mutableSegments) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptio } func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, error) { - // Take references to the pending entries docs - // and make sure not to touch sort order until later. 
- entries := inserts.PendingEntries() - docs := inserts.PendingDocs() - m.Lock() if m.state == mutableSegmentsStateClosed { m.Unlock() @@ -271,22 +265,29 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, } m.compact.compactingForeground = true - if m.indexedSnapshot == nil { - m.indexedSnapshot = builder.NewIDsMap(builder.IDsMapOptions{}) - } - for i := range docs { - m.indexedSnapshot.SetUnsafe(docs[i].ID, struct{}{}, builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) - } - builder := m.compact.segmentBuilder + segmentBuilder := m.compact.segmentBuilder m.Unlock() // Updsate indexedBloomFilterByTime if needed. var activeBlockStarts []xtime.UnixNano if m.blockOpts.InMemoryBlock { + // Take references to the pending entries docs + // and make sure not to touch sort order until later. + entries := inserts.PendingEntries() + docs := inserts.PendingDocs() + m.indexedBloomFilterByTimeLock.Lock() + // Add to the indexed snapshot set. + if m.indexedSnapshot == nil { + m.indexedSnapshot = builder.NewIDsMap(builder.IDsMapOptions{}) + } + for i := range docs { + m.indexedSnapshot.SetUnsafe(docs[i].ID, struct{}{}, builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) + } + // Remove for indexing anything already indexed and // also update the tracking of what things have been indexed // for what block starts. @@ -350,12 +351,12 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, m.Unlock() }() - builder.Reset() - insertResultErr := builder.InsertBatch(m3ninxindex.Batch{ + segmentBuilder.Reset() + insertResultErr := segmentBuilder.InsertBatch(m3ninxindex.Batch{ Docs: inserts.PendingDocs(), AllowPartialUpdates: true, }) - if len(builder.Docs()) == 0 { + if len(segmentBuilder.Docs()) == 0 { // No inserts, no need to compact. return MutableSegmentsStats{}, insertResultErr } @@ -363,7 +364,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, // We inserted some documents, need to compact immediately into a // foreground segment from the segment builder before we can serve reads // from an FST segment. - result, err := m.foregroundCompactWithBuilder(builder, activeBlockStarts) + result, err := m.foregroundCompactWithBuilder(segmentBuilder, activeBlockStarts) if err != nil { return MutableSegmentsStats{}, err } @@ -535,74 +536,82 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { activeBlockStarts []xtime.UnixNano activeFilter *builder.IDsMap ) - if m.blockOpts.InMemoryBlock && m.indexedSnapshot != nil { - // Only set the bloom filter to actively filter series out - // if there were any segments that need the active block starts - // updated. - activeBlockStarts = m.backgroundCompactActiveBlockStarts - activeFilter = m.backgroundCompactIndexedSnapshot - if activeFilter == nil { - activeFilter = builder.NewIDsMap(builder.IDsMapOptions{}) - m.backgroundCompactIndexedSnapshot = activeFilter - } - // Copy the indexed snapshot map so can use it downstream safely - // without holding a lock. 
- activeFilter.Reset() - for _, elem := range m.indexedSnapshot.Iter() { - activeFilter.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) + if m.blockOpts.InMemoryBlock { + mayNeedFiltering := false + m.indexedBloomFilterByTimeLock.Lock() + if m.indexedSnapshot != nil { + mayNeedFiltering = true + // Only set the bloom filter to actively filter series out + // if there were any segments that need the active block starts + // updated. + activeBlockStarts = m.backgroundCompactActiveBlockStarts + activeFilter = m.backgroundCompactIndexedSnapshot + if activeFilter == nil { + activeFilter = builder.NewIDsMap(builder.IDsMapOptions{}) + m.backgroundCompactIndexedSnapshot = activeFilter + } + // Copy the indexed snapshot map so can use it downstream safely + // without holding a lock. + activeFilter.Reset() + for _, elem := range m.indexedSnapshot.Iter() { + activeFilter.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) + } } + m.indexedBloomFilterByTimeLock.Unlock() // Now check which segments need filtering. - for _, seg := range m.backgroundSegments { - alreadyHasTask := false - for _, task := range plan.Tasks { - for _, taskSegment := range task.Segments { - if taskSegment.Segment == seg.Segment() { - alreadyHasTask = true - break + if mayNeedFiltering { + for _, seg := range m.backgroundSegments { + alreadyHasTask := false + for _, task := range plan.Tasks { + for _, taskSegment := range task.Segments { + if taskSegment.Segment == seg.Segment() { + alreadyHasTask = true + break + } } } - } - if alreadyHasTask { - // Skip needing to check if segment needs filtering. - continue - } + if alreadyHasTask { + // Skip needing to check if segment needs filtering. + continue + } - activeBlockStartsOutdated := false - for _, blockStart := range seg.containedBlockStarts { - found := false - for _, activeBlockStart := range activeBlockStarts { - if activeBlockStart == blockStart { - found = true + activeBlockStartsOutdated := false + for _, blockStart := range seg.containedBlockStarts { + found := false + for _, activeBlockStart := range activeBlockStarts { + if activeBlockStart == blockStart { + found = true + break + } + } + if !found { + // Contains an active block start that should be removed. + activeBlockStartsOutdated = true break } } - if !found { - // Contains an active block start that should be removed. - activeBlockStartsOutdated = true - break - } - } - if !activeBlockStartsOutdated { - continue - } + if !activeBlockStartsOutdated { + continue + } - // The active block starts are outdated, need to compact - // and remove any old data from the segment. - plan.Tasks = append(plan.Tasks, compaction.Task{ - Segments: []compaction.Segment{ - { - Age: seg.Age(), - Size: seg.Segment().Size(), - Type: segments.FSTType, - Segment: seg.Segment(), + // The active block starts are outdated, need to compact + // and remove any old data from the segment. 
+ plan.Tasks = append(plan.Tasks, compaction.Task{ + Segments: []compaction.Segment{ + { + Age: seg.Age(), + Size: seg.Segment().Size(), + Type: segments.FSTType, + Segment: seg.Segment(), + }, }, - }, - }) + }) + } } } From c25a7bcec16a3d94621e6b5450559845b0613fa8 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 15:49:19 -0500 Subject: [PATCH 071/106] Fix cast to interface --- src/dbnode/storage/index/mutable_segments.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 911b78c7ed..4c77c64baa 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -534,7 +534,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { var ( activeBlockStarts []xtime.UnixNano - activeFilter *builder.IDsMap + activeFilter segment.DocumentsFilter ) if m.blockOpts.InMemoryBlock { mayNeedFiltering := false @@ -545,20 +545,19 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // if there were any segments that need the active block starts // updated. activeBlockStarts = m.backgroundCompactActiveBlockStarts - activeFilter = m.backgroundCompactIndexedSnapshot - if activeFilter == nil { - activeFilter = builder.NewIDsMap(builder.IDsMapOptions{}) - m.backgroundCompactIndexedSnapshot = activeFilter + if m.backgroundCompactIndexedSnapshot == nil { + m.backgroundCompactIndexedSnapshot = builder.NewIDsMap(builder.IDsMapOptions{}) } // Copy the indexed snapshot map so can use it downstream safely // without holding a lock. - activeFilter.Reset() + m.backgroundCompactIndexedSnapshot.Reset() for _, elem := range m.indexedSnapshot.Iter() { - activeFilter.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ + m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ NoCopyKey: true, NoFinalizeKey: true, }) } + activeFilter = m.backgroundCompactIndexedSnapshot } m.indexedBloomFilterByTimeLock.Unlock() @@ -685,7 +684,7 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg func (m *mutableSegments) backgroundCompactWithPlan( plan *compaction.Plan, activeBlockStarts []xtime.UnixNano, - activeFilter *builder.IDsMap, + activeFilter segment.DocumentsFilter, ) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() defer sw.Stop() @@ -748,7 +747,7 @@ func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, activeBlockStarts []xtime.UnixNano, - activeFilter *builder.IDsMap, + activeFilter segment.DocumentsFilter, compactor *compaction.Compactor, log bool, logger *zap.Logger, From 976bf85d59c594b06bd793237f2038e565b84800 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 18:38:11 -0500 Subject: [PATCH 072/106] Allocation indexedSnapshot at creation --- src/dbnode/storage/index/mutable_segments.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 4c77c64baa..19cea8837a 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -175,6 +175,7 @@ func newMutableSegments( blockOpts: blockOpts, iopts: iopts, indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), + indexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), 
metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), logger: iopts.Logger(), } @@ -278,9 +279,6 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, m.indexedBloomFilterByTimeLock.Lock() // Add to the indexed snapshot set. - if m.indexedSnapshot == nil { - m.indexedSnapshot = builder.NewIDsMap(builder.IDsMapOptions{}) - } for i := range docs { m.indexedSnapshot.SetUnsafe(docs[i].ID, struct{}{}, builder.IDsMapSetUnsafeOptions{ NoCopyKey: true, @@ -539,7 +537,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { if m.blockOpts.InMemoryBlock { mayNeedFiltering := false m.indexedBloomFilterByTimeLock.Lock() - if m.indexedSnapshot != nil { + if m.indexedSnapshot.Len() > 0 { mayNeedFiltering = true // Only set the bloom filter to actively filter series out // if there were any segments that need the active block starts From 6284b7bb63ed320321480fbc0f7c51f92fb7465f Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 18:52:08 -0500 Subject: [PATCH 073/106] Fix build --- src/dbnode/storage/index/mutable_segments.go | 39 +++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 19cea8837a..d042f60daa 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -344,6 +344,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, defer func() { m.Lock() + m.backgroundCompactActiveBlockStarts = activeBlockStarts m.compact.compactingForeground = false m.cleanupForegroundCompactWithLock() m.Unlock() @@ -532,30 +533,13 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { var ( activeBlockStarts []xtime.UnixNano - activeFilter segment.DocumentsFilter ) if m.blockOpts.InMemoryBlock { mayNeedFiltering := false + activeBlockStarts = m.backgroundCompactActiveBlockStarts m.indexedBloomFilterByTimeLock.Lock() if m.indexedSnapshot.Len() > 0 { mayNeedFiltering = true - // Only set the bloom filter to actively filter series out - // if there were any segments that need the active block starts - // updated. - activeBlockStarts = m.backgroundCompactActiveBlockStarts - if m.backgroundCompactIndexedSnapshot == nil { - m.backgroundCompactIndexedSnapshot = builder.NewIDsMap(builder.IDsMapOptions{}) - } - // Copy the indexed snapshot map so can use it downstream safely - // without holding a lock. - m.backgroundCompactIndexedSnapshot.Reset() - for _, elem := range m.indexedSnapshot.Iter() { - m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) - } - activeFilter = m.backgroundCompactIndexedSnapshot } m.indexedBloomFilterByTimeLock.Unlock() @@ -619,6 +603,25 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Kick off compaction. m.compact.compactingBackground = true go func() { + var activeFilter segment.DocumentsFilter + m.indexedBloomFilterByTimeLock.Lock() + if m.indexedSnapshot.Len() > 0 { + // Only set the bloom filter to actively filter series out + // if there were any segments that need the active block starts + // updated. + // Copy the indexed snapshot map so can use it downstream safely + // without holding a lock. 
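The interface-typed activeFilter above is assigned only once a populated map exists, which sidesteps Go's typed-nil pitfall: a nil concrete pointer stored in an interface makes the interface itself compare non-nil. A self-contained sketch of that pitfall, under invented names:

package main

import "fmt"

type filter interface {
	Contains(id []byte) bool
}

type idsMap struct{ ids map[string]struct{} }

func (m *idsMap) Contains(id []byte) bool {
	_, ok := m.ids[string(id)]
	return ok
}

func main() {
	var concrete *idsMap // nil pointer of a concrete type

	var f filter = concrete
	// The interface wraps a nil *idsMap, so it compares as non-nil and a
	// nil check no longer protects callers.
	fmt.Println(f != nil) // true

	// Assigning into the interface only when a real value exists avoids it.
	var safe filter
	if concrete != nil {
		safe = concrete
	}
	fmt.Println(safe == nil) // true
}
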
+ m.backgroundCompactIndexedSnapshot.Reset() + for _, elem := range m.indexedSnapshot.Iter() { + m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) + } + activeFilter = m.backgroundCompactIndexedSnapshot + } + m.indexedBloomFilterByTimeLock.Unlock() + m.backgroundCompactWithPlan(plan, activeBlockStarts, activeFilter) m.Lock() From 0aed2f5ebc73657df5ebd5137febc61b9ab392fd Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 19:06:53 -0500 Subject: [PATCH 074/106] Allocate backgroundCompactIndexedSnapshot when creating mutable segments --- src/dbnode/storage/index/mutable_segments.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index d042f60daa..cd9e8d4159 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -169,15 +169,16 @@ func newMutableSegments( iopts instrument.Options, ) *mutableSegments { m := &mutableSegments{ - blockStart: blockStart, - blockSize: md.Options().IndexOptions().BlockSize(), - opts: opts, - blockOpts: blockOpts, - iopts: iopts, - indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), - indexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), - metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), - logger: iopts.Logger(), + blockStart: blockStart, + blockSize: md.Options().IndexOptions().BlockSize(), + opts: opts, + blockOpts: blockOpts, + iopts: iopts, + indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), + indexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), + backgroundCompactIndexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), + metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), + logger: iopts.Logger(), } m.optsListener = namespaceRuntimeOptsMgr.RegisterListener(m) return m From b961dea80be2a68678d561166effe161815eb1ed Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 20:18:06 -0500 Subject: [PATCH 075/106] Remove lock acquiring to check indexed snapshot --- src/dbnode/storage/index/mutable_segments.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index cd9e8d4159..1c09bab051 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -532,17 +532,13 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { return } - var ( - activeBlockStarts []xtime.UnixNano - ) + var activeBlockStarts []xtime.UnixNano if m.blockOpts.InMemoryBlock { mayNeedFiltering := false activeBlockStarts = m.backgroundCompactActiveBlockStarts - m.indexedBloomFilterByTimeLock.Lock() - if m.indexedSnapshot.Len() > 0 { + if len(activeBlockStarts) > 0 { mayNeedFiltering = true } - m.indexedBloomFilterByTimeLock.Unlock() // Now check which segments need filtering. 
if mayNeedFiltering { From 16a1c54b5875342f6d1fa3d098237c600b262edb Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 24 Jan 2021 23:11:01 -0500 Subject: [PATCH 076/106] Call NotifySealedBlocks out of lock and only do the full check of whether indexedSnapshot needs update if block starts were removed --- src/dbnode/storage/index.go | 22 +++++++------- src/dbnode/storage/index/mutable_segments.go | 31 ++++++++++++-------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index ee364413ee..6048b657f1 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -903,22 +903,27 @@ func (i *nsIndex) Bootstrapped() bool { } func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceIndexTickResult, error) { - var result namespaceIndexTickResult - + var ( + result namespaceIndexTickResult + multiErr xerrors.MultiError + ) i.state.Lock() + sealedBlocks := make([]xtime.UnixNano, 0, len(i.state.blocksByTime)) defer func() { i.updateBlockStartsWithLock() + activeBlock := i.inMemoryBlock i.state.Unlock() + // Notify in memory block of sealed blocks + // and make sure to do this out of the lock since + // this can take a considerable amount of time + // and is an expensive task that doesn't require + // holding the index lock. + _ = activeBlock.InMemoryBlockNotifySealedBlocks(sealedBlocks) }() earliestBlockStartToRetain := i.earliestBlockStartToRetainWithLock(startTime) result.NumBlocks = int64(len(i.state.blocksByTime)) - - var ( - multiErr xerrors.MultiError - sealedBlocks = make([]xtime.UnixNano, 0, len(i.state.blocksByTime)) - ) for blockStart, block := range i.state.blocksByTime { if c.IsCancelled() { multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation) @@ -963,9 +968,6 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceInd result.NumTotalDocs += blockTickResult.NumDocs result.FreeMmap += blockTickResult.FreeMmap - // Notify in memory block of sealed blocks. - multiErr = multiErr.Add(block.InMemoryBlockNotifySealedBlocks(sealedBlocks)) - return result, multiErr.FinalError() } diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 1c09bab051..635b4355c7 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -191,6 +191,7 @@ func (m *mutableSegments) NotifySealedBlocks( return nil } + removedBlockStarts := false m.indexedBloomFilterByTimeLock.Lock() // Remove entire time windows. for _, blockStart := range sealed { @@ -200,19 +201,25 @@ func (m *mutableSegments) NotifySealedBlocks( } // Remove indexed set if block now sealed. delete(m.indexedBloomFilterByTime, blockStart) - } - // Remove any from the indexed snapshots. - for _, elem := range m.indexedSnapshot.Iter() { - id := elem.Key() - contained := false - for _, filter := range m.indexedBloomFilterByTime { - if filter.ContainsWithNoFalsePositive(id) { - contained = true - break + removedBlockStarts = true + } + if removedBlockStarts { + // Remove any from the indexed snapshots, + // only do this work which is expensive + // if and only if there were block starts that + // were actually removed. 
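The Tick change above moves InMemoryBlockNotifySealedBlocks out from under the index state lock by capturing the sealed block starts first. A minimal sketch of that capture-then-release pattern, with hypothetical names:

package main

import (
	"fmt"
	"sync"
)

type nsIndexSketch struct {
	mu     sync.Mutex
	sealed []int64 // sealed block starts, guarded by mu
}

// tick copies the state it needs while holding the lock, then performs the
// expensive notification with no lock held.
func (i *nsIndexSketch) tick(notify func(sealed []int64)) {
	i.mu.Lock()
	snapshot := append([]int64(nil), i.sealed...)
	i.mu.Unlock()

	notify(snapshot)
}

func main() {
	idx := &nsIndexSketch{sealed: []int64{100, 200}}
	idx.tick(func(s []int64) { fmt.Println("notified sealed blocks:", s) })
}
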
+ for _, elem := range m.indexedSnapshot.Iter() { + id := elem.Key() + contained := false + for _, filter := range m.indexedBloomFilterByTime { + if filter.ContainsWithNoFalsePositive(id) { + contained = true + break + } + } + if !contained { + m.indexedSnapshot.Delete(id) } - } - if !contained { - m.indexedSnapshot.Delete(id) } } m.indexedBloomFilterByTimeLock.Unlock() From 193c4f3a83c7b7ae7caf8593a39daaa857f917fc Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 25 Jan 2021 18:06:50 -0500 Subject: [PATCH 077/106] Separate GCing background segments from foreground segments --- src/dbnode/storage/index/mutable_segments.go | 64 ++++++++++++++------ src/dbnode/storage/index/segments.go | 1 + 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 635b4355c7..8e6852ff8a 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -516,13 +516,18 @@ func (m *mutableSegments) Close() { } func (m *mutableSegments) maybeBackgroundCompactWithLock() { - if m.compact.compactingBackground { + if m.compact.compactingBackgroundStandard { return } // Create a logical plan. segs := make([]compaction.Segment, 0, len(m.backgroundSegments)) for _, seg := range m.backgroundSegments { + if seg.garbageCollecting { + // Do not try to compact something that we are background + // garbage collecting documents from (that have been phased out). + continue + } segs = append(segs, compaction.Segment{ Age: seg.Age(), Size: seg.Segment().Size(), @@ -539,7 +544,10 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { return } - var activeBlockStarts []xtime.UnixNano + var ( + activeBlockStarts []xtime.UnixNano + gcPlan = &compaction.Plan{} + ) if m.blockOpts.InMemoryBlock { mayNeedFiltering := false activeBlockStarts = m.backgroundCompactActiveBlockStarts @@ -547,8 +555,9 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { mayNeedFiltering = true } - // Now check which segments need filtering. - if mayNeedFiltering { + // Now check which segments need filtering if and only if + // we're not background compacting. + if !m.compact.compactingBackgroundGarbageCollect && mayNeedFiltering { for _, seg := range m.backgroundSegments { alreadyHasTask := false for _, task := range plan.Tasks { @@ -586,7 +595,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // The active block starts are outdated, need to compact // and remove any old data from the segment. - plan.Tasks = append(plan.Tasks, compaction.Task{ + gcPlan.Tasks = append(gcPlan.Tasks, compaction.Task{ Segments: []compaction.Segment{ { Age: seg.Age(), @@ -596,20 +605,28 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }, }, }) + + // Mark as not-compactable for standard compactions + // since this will be async compacted into a smaller + // segment. + seg.garbageCollecting = true } } } - if len(plan.Tasks) == 0 { + if len(plan.Tasks) == 0 && len(gcPlan.Tasks) == 0 { return } // Kick off compaction. 
- m.compact.compactingBackground = true + m.compact.compactingBackgroundStandard = true + if len(gcPlan.Tasks) != 0 { + m.compact.compactingBackgroundGarbageCollect = true + } go func() { var activeFilter segment.DocumentsFilter m.indexedBloomFilterByTimeLock.Lock() - if m.indexedSnapshot.Len() > 0 { + if n := m.indexedSnapshot.Len(); n > 0 { // Only set the bloom filter to actively filter series out // if there were any segments that need the active block starts // updated. @@ -626,10 +643,22 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { } m.indexedBloomFilterByTimeLock.Unlock() + if len(gcPlan.Tasks) != 0 { + // Run non-GC tasks separately so the standard loop is not blocked. + go func() { + m.backgroundCompactWithPlan(gcPlan, activeBlockStarts, activeFilter) + + m.Lock() + m.compact.compactingBackgroundGarbageCollect = false + m.cleanupBackgroundCompactWithLock() + m.Unlock() + }() + } + m.backgroundCompactWithPlan(plan, activeBlockStarts, activeFilter) m.Lock() - m.compact.compactingBackground = false + m.compact.compactingBackgroundStandard = false m.cleanupBackgroundCompactWithLock() m.Unlock() }() @@ -1095,20 +1124,21 @@ func (m *mutableSegments) cleanupCompactWithLock() { if !m.compact.compactingForeground { m.cleanupForegroundCompactWithLock() } - if !m.compact.compactingBackground { + if !m.compact.compactingBackgroundStandard && !m.compact.compactingBackgroundGarbageCollect { m.cleanupBackgroundCompactWithLock() } } // mutableSegmentsCompact has several lazily allocated compaction components. type mutableSegmentsCompact struct { - segmentBuilder segment.CloseableDocumentsBuilder - foregroundCompactor *compaction.Compactor - backgroundCompactors chan *compaction.Compactor - compactingForeground bool - compactingBackground bool - numForeground int - numBackground int + segmentBuilder segment.CloseableDocumentsBuilder + foregroundCompactor *compaction.Compactor + backgroundCompactors chan *compaction.Compactor + compactingForeground bool + compactingBackgroundStandard bool + compactingBackgroundGarbageCollect bool + numForeground int + numBackground int } func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( diff --git a/src/dbnode/storage/index/segments.go b/src/dbnode/storage/index/segments.go index 2a7f5a44c0..2fc5dba267 100644 --- a/src/dbnode/storage/index/segments.go +++ b/src/dbnode/storage/index/segments.go @@ -33,6 +33,7 @@ type readableSeg struct { createdAt time.Time segment segment.Segment containedBlockStarts []xtime.UnixNano + garbageCollecting bool } func newReadableSeg( From cdd87953347eaefc08923652fa97c9f6cc14418a Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 25 Jan 2021 18:48:10 -0500 Subject: [PATCH 078/106] Use separate number of compactors per type of background compaction --- src/dbnode/storage/index/mutable_segments.go | 77 ++++++++++++++------ 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 8e6852ff8a..dedade9d8d 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -52,9 +52,8 @@ var ( errForegroundCompactorBadPlanFirstTask = errors.New("index foreground compactor generated plan without mutable segment in first task") errForegroundCompactorBadPlanSecondaryTask = errors.New("index foreground compactor generated plan with mutable segment a secondary task") - // numBackgroundCompactors should use up to num CPU minus one - // to reserve for the 
foreground compactor. - numBackgroundCompactors = int(math.Max(1, float64(runtime.NumCPU())-1)) + numBackgroundCompactorsStandard = 1 + numBackgroundCompactorsGarbageCollect = 1 ) type mutableSegmentsState uint @@ -634,10 +633,11 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // without holding a lock. m.backgroundCompactIndexedSnapshot.Reset() for _, elem := range m.indexedSnapshot.Iter() { - m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) + m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, + builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) } activeFilter = m.backgroundCompactIndexedSnapshot } @@ -646,7 +646,8 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { if len(gcPlan.Tasks) != 0 { // Run non-GC tasks separately so the standard loop is not blocked. go func() { - m.backgroundCompactWithPlan(gcPlan, activeBlockStarts, activeFilter) + m.backgroundCompactWithPlan(gcPlan, activeBlockStarts, + activeFilter, m.compact.backgroundCompactorsGarbageCollect) m.Lock() m.compact.compactingBackgroundGarbageCollect = false @@ -655,7 +656,8 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }() } - m.backgroundCompactWithPlan(plan, activeBlockStarts, activeFilter) + m.backgroundCompactWithPlan(plan, activeBlockStarts, + activeFilter, m.compact.backgroundCompactorsStandard) m.Lock() m.compact.compactingBackgroundStandard = false @@ -686,17 +688,24 @@ func (m *mutableSegments) cleanupBackgroundCompactWithLock() { m.backgroundSegments = nil // Free compactor resources. - if m.compact.backgroundCompactors == nil { + if m.compact.backgroundCompactorsStandard == nil { return } - backgroundCompactors := m.compact.backgroundCompactors - close(backgroundCompactors) - - m.compact.backgroundCompactors = nil + backgroundCompactors := []chan *compaction.Compactor{ + m.compact.backgroundCompactorsStandard, + m.compact.backgroundCompactorsGarbageCollect, + } + m.compact.backgroundCompactorsStandard = nil + m.compact.backgroundCompactorsGarbageCollect = nil + for _, compactors := range backgroundCompactors { + close(compactors) + for compactor := range compactors { + err := compactor.Close() + if err == nil { + continue + } - for compactor := range backgroundCompactors { - if err := compactor.Close(); err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { l.Error("error closing index block background compactor", zap.Error(err)) }) @@ -719,6 +728,7 @@ func (m *mutableSegments) backgroundCompactWithPlan( plan *compaction.Plan, activeBlockStarts []xtime.UnixNano, activeFilter segment.DocumentsFilter, + compactors chan *compaction.Compactor, ) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() defer sw.Stop() @@ -748,10 +758,10 @@ func (m *mutableSegments) backgroundCompactWithPlan( for i, task := range plan.Tasks { i, task := i, task wg.Add(1) - compactor := <-m.compact.backgroundCompactors + compactor := <-compactors go func() { defer func() { - m.compact.backgroundCompactors <- compactor + compactors <- compactor wg.Done() }() err := m.backgroundCompactWithTask(task, activeBlockStarts, @@ -988,7 +998,7 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( if len(segments) == 0 { return } - if m.compact.backgroundCompactors == nil { + if m.compact.backgroundCompactorsStandard == nil { // No longer performing background compaction due to evict/close. 
return } @@ -1133,7 +1143,8 @@ func (m *mutableSegments) cleanupCompactWithLock() { type mutableSegmentsCompact struct { segmentBuilder segment.CloseableDocumentsBuilder foregroundCompactor *compaction.Compactor - backgroundCompactors chan *compaction.Compactor + backgroundCompactorsStandard chan *compaction.Compactor + backgroundCompactorsGarbageCollect chan *compaction.Compactor compactingForeground bool compactingBackgroundStandard bool compactingBackgroundGarbageCollect bool @@ -1179,9 +1190,27 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( } } - if m.backgroundCompactors == nil { - m.backgroundCompactors = make(chan *compaction.Compactor, numBackgroundCompactors) - for i := 0; i < numBackgroundCompactors; i++ { + if m.backgroundCompactorsStandard == nil { + n := numBackgroundCompactorsStandard + m.backgroundCompactorsStandard = make(chan *compaction.Compactor, n) + for i := 0; i < n; i++ { + backgroundCompactor, err := compaction.NewCompactor(docsPool, + DocumentArrayPoolCapacity, + opts.SegmentBuilderOptions(), + opts.FSTSegmentOptions(), + compaction.CompactorOptions{ + MmapDocsData: blockOpts.BackgroundCompactorMmapDocsData, + }) + if err != nil { + return err + } + m.backgroundCompactorsStandard <- backgroundCompactor + } + } + if m.backgroundCompactorsGarbageCollect == nil { + n := numBackgroundCompactorsGarbageCollect + m.backgroundCompactorsGarbageCollect = make(chan *compaction.Compactor, n) + for i := 0; i < n; i++ { backgroundCompactor, err := compaction.NewCompactor(docsPool, DocumentArrayPoolCapacity, opts.SegmentBuilderOptions(), @@ -1192,7 +1221,7 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( if err != nil { return err } - m.backgroundCompactors <- backgroundCompactor + m.backgroundCompactorsGarbageCollect <- backgroundCompactor } } From 5eb19b6ed7a2864fc50faacffcedd1ead9f03782 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 26 Jan 2021 20:11:00 -0500 Subject: [PATCH 079/106] Alloc GC segments each background GC segment run, also do not mutate indexed snapshot during a GC run --- src/dbnode/storage/index/mutable_segments.go | 148 +++++++++++-------- 1 file changed, 83 insertions(+), 65 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index dedade9d8d..ff37f15c0a 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -172,6 +172,7 @@ func newMutableSegments( blockSize: md.Options().IndexOptions().BlockSize(), opts: opts, blockOpts: blockOpts, + compact: mutableSegmentsCompact{opts: opts, blockOpts: blockOpts}, iopts: iopts, indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), indexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), @@ -265,8 +266,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, } // Lazily allocate the segment builder and compactors. 
- err := m.compact.allocLazyBuilderAndCompactorsWithLock(m.writeIndexingConcurrency, - m.blockOpts, m.opts) + err := m.compact.allocLazyBuilderAndCompactorsWithLock(m.writeIndexingConcurrency) if err != nil { m.Unlock() return MutableSegmentsStats{}, err @@ -546,6 +546,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { var ( activeBlockStarts []xtime.UnixNano gcPlan = &compaction.Plan{} + gcAlreadyRunning = m.compact.compactingBackgroundGarbageCollect ) if m.blockOpts.InMemoryBlock { mayNeedFiltering := false @@ -556,7 +557,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Now check which segments need filtering if and only if // we're not background compacting. - if !m.compact.compactingBackgroundGarbageCollect && mayNeedFiltering { + if !gcAlreadyRunning && mayNeedFiltering { for _, seg := range m.backgroundSegments { alreadyHasTask := false for _, task := range plan.Tasks { @@ -629,15 +630,21 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { // Only set the bloom filter to actively filter series out // if there were any segments that need the active block starts // updated. - // Copy the indexed snapshot map so can use it downstream safely - // without holding a lock. - m.backgroundCompactIndexedSnapshot.Reset() - for _, elem := range m.indexedSnapshot.Iter() { - m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, - builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) + if !gcAlreadyRunning { + // Make sure to only mutate the indexed snapshot + // if GC isn't already running since otherwise we'll be + // concurrently writing to the snapshot that's being used for + // filtering by segments that are being GC'ed. + // Copy the indexed snapshot map so can use it downstream safely + // without holding a lock. + m.backgroundCompactIndexedSnapshot.Reset() + for _, elem := range m.indexedSnapshot.Iter() { + m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, + builder.IDsMapSetUnsafeOptions{ + NoCopyKey: true, + NoFinalizeKey: true, + }) + } } activeFilter = m.backgroundCompactIndexedSnapshot } @@ -646,8 +653,16 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { if len(gcPlan.Tasks) != 0 { // Run non-GC tasks separately so the standard loop is not blocked. go func() { - m.backgroundCompactWithPlan(gcPlan, activeBlockStarts, - activeFilter, m.compact.backgroundCompactorsGarbageCollect) + compactors, err := m.compact.allocBackgroundCompactorsGarbageCollect() + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error background gc segments", zap.Error(err)) + }) + } else { + m.backgroundCompactWithPlan(gcPlan, activeBlockStarts, + activeFilter, compactors) + m.closeCompactors(compactors) + } m.Lock() m.compact.compactingBackgroundGarbageCollect = false @@ -657,7 +672,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { } m.backgroundCompactWithPlan(plan, activeBlockStarts, - activeFilter, m.compact.backgroundCompactorsStandard) + activeFilter, m.compact.backgroundCompactors) m.Lock() m.compact.compactingBackgroundStandard = false @@ -688,28 +703,25 @@ func (m *mutableSegments) cleanupBackgroundCompactWithLock() { m.backgroundSegments = nil // Free compactor resources. 
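The background compactors are passed around via a buffered channel that doubles as a resource pool: a receive checks a compactor out, a send returns it (as backgroundCompactWithPlan does above), and closing the channel lets the cleanup path drain and close every instance. A condensed sketch, with compactor as a stand-in type:

package main

import "fmt"

type compactor struct{ id int }

func (c *compactor) close() { fmt.Println("closed compactor", c.id) }

func main() {
	pool := make(chan *compactor, 2)
	for i := 0; i < cap(pool); i++ {
		pool <- &compactor{id: i}
	}

	c := <-pool // check one out for a task
	// ... run the compaction task with c ...
	pool <- c // return it for the next task

	// Shutdown: closing the channel lets a range drain and close each
	// instance exactly once.
	close(pool)
	for c := range pool {
		c.close()
	}
}
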
- if m.compact.backgroundCompactorsStandard == nil { + if m.compact.backgroundCompactors == nil { return } - backgroundCompactors := []chan *compaction.Compactor{ - m.compact.backgroundCompactorsStandard, - m.compact.backgroundCompactorsGarbageCollect, - } - m.compact.backgroundCompactorsStandard = nil - m.compact.backgroundCompactorsGarbageCollect = nil - for _, compactors := range backgroundCompactors { - close(compactors) - for compactor := range compactors { - err := compactor.Close() - if err == nil { - continue - } + m.closeCompactors(m.compact.backgroundCompactors) + m.compact.backgroundCompactors = nil +} - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("error closing index block background compactor", zap.Error(err)) - }) +func (m *mutableSegments) closeCompactors(compactors chan *compaction.Compactor) { + close(compactors) + for compactor := range compactors { + err := compactor.Close() + if err == nil { + continue } + + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error closing index block background compactor", zap.Error(err)) + }) } } @@ -998,7 +1010,7 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( if len(segments) == 0 { return } - if m.compact.backgroundCompactorsStandard == nil { + if m.compact.backgroundCompactors == nil { // No longer performing background compaction due to evict/close. return } @@ -1141,10 +1153,12 @@ func (m *mutableSegments) cleanupCompactWithLock() { // mutableSegmentsCompact has several lazily allocated compaction components. type mutableSegmentsCompact struct { + opts Options + blockOpts BlockOptions + segmentBuilder segment.CloseableDocumentsBuilder foregroundCompactor *compaction.Compactor - backgroundCompactorsStandard chan *compaction.Compactor - backgroundCompactorsGarbageCollect chan *compaction.Compactor + backgroundCompactors chan *compaction.Compactor compactingForeground bool compactingBackgroundStandard bool compactingBackgroundGarbageCollect bool @@ -1154,15 +1168,13 @@ type mutableSegmentsCompact struct { func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( concurrency int, - blockOpts BlockOptions, - opts Options, ) error { var ( err error - docsPool = opts.DocumentArrayPool() + docsPool = m.opts.DocumentArrayPool() ) if m.segmentBuilder == nil { - builderOpts := opts.SegmentBuilderOptions(). + builderOpts := m.opts.SegmentBuilderOptions(). SetConcurrency(concurrency) m.segmentBuilder, err = builder.NewBuilderFromDocuments(builderOpts) @@ -1174,8 +1186,8 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( if m.foregroundCompactor == nil { m.foregroundCompactor, err = compaction.NewCompactor(docsPool, DocumentArrayPoolCapacity, - opts.SegmentBuilderOptions(), - opts.FSTSegmentOptions(), + m.opts.SegmentBuilderOptions(), + m.opts.FSTSegmentOptions(), compaction.CompactorOptions{ FSTWriterOptions: &fst.WriterOptions{ // DisableRegistry is set to true to trade a larger FST size @@ -1183,51 +1195,57 @@ func (m *mutableSegmentsCompact) allocLazyBuilderAndCompactorsWithLock( // to end latency for time to first index a metric. 
DisableRegistry: true, }, - MmapDocsData: blockOpts.ForegroundCompactorMmapDocsData, + MmapDocsData: m.blockOpts.ForegroundCompactorMmapDocsData, }) if err != nil { return err } } - if m.backgroundCompactorsStandard == nil { + if m.backgroundCompactors == nil { n := numBackgroundCompactorsStandard - m.backgroundCompactorsStandard = make(chan *compaction.Compactor, n) + m.backgroundCompactors = make(chan *compaction.Compactor, n) for i := 0; i < n; i++ { backgroundCompactor, err := compaction.NewCompactor(docsPool, DocumentArrayPoolCapacity, - opts.SegmentBuilderOptions(), - opts.FSTSegmentOptions(), + m.opts.SegmentBuilderOptions(), + m.opts.FSTSegmentOptions(), compaction.CompactorOptions{ - MmapDocsData: blockOpts.BackgroundCompactorMmapDocsData, + MmapDocsData: m.blockOpts.BackgroundCompactorMmapDocsData, }) if err != nil { return err } - m.backgroundCompactorsStandard <- backgroundCompactor - } - } - if m.backgroundCompactorsGarbageCollect == nil { - n := numBackgroundCompactorsGarbageCollect - m.backgroundCompactorsGarbageCollect = make(chan *compaction.Compactor, n) - for i := 0; i < n; i++ { - backgroundCompactor, err := compaction.NewCompactor(docsPool, - DocumentArrayPoolCapacity, - opts.SegmentBuilderOptions(), - opts.FSTSegmentOptions(), - compaction.CompactorOptions{ - MmapDocsData: blockOpts.BackgroundCompactorMmapDocsData, - }) - if err != nil { - return err - } - m.backgroundCompactorsGarbageCollect <- backgroundCompactor + m.backgroundCompactors <- backgroundCompactor } } return nil } +func (m *mutableSegmentsCompact) allocBackgroundCompactorsGarbageCollect() ( + chan *compaction.Compactor, + error, +) { + docsPool := m.opts.DocumentArrayPool() + n := numBackgroundCompactorsGarbageCollect + compactors := make(chan *compaction.Compactor, n) + for i := 0; i < n; i++ { + backgroundCompactor, err := compaction.NewCompactor(docsPool, + DocumentArrayPoolCapacity, + m.opts.SegmentBuilderOptions(), + m.opts.FSTSegmentOptions(), + compaction.CompactorOptions{ + MmapDocsData: m.blockOpts.BackgroundCompactorMmapDocsData, + }) + if err != nil { + return nil, err + } + compactors <- backgroundCompactor + } + return compactors, nil +} + func taskNumBuilders(task compaction.Task) int { builders := 0 for _, seg := range task.Segments { From ccb30354af416e8f94d032283222f6209f370722 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 28 Jan 2021 19:18:36 -0500 Subject: [PATCH 080/106] Reduce insert lock contention by spreading over many queues per CPU --- src/dbnode/storage/index_insert_queue.go | 14 +++++++++++--- src/dbnode/storage/shard_insert_queue.go | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/dbnode/storage/index_insert_queue.go b/src/dbnode/storage/index_insert_queue.go index 47798db952..acbbfecab7 100644 --- a/src/dbnode/storage/index_insert_queue.go +++ b/src/dbnode/storage/index_insert_queue.go @@ -52,6 +52,8 @@ const ( defaultIndexBatchBackoff = 2 * time.Millisecond indexResetAllInsertsEvery = 3 * time.Minute + + queuesPerCPUCore = 32 ) type nsIndexInsertQueue struct { @@ -226,7 +228,13 @@ func (q *nsIndexInsertQueue) InsertPending( // Note: since inserts by CPU core is allocated when // nsIndexInsertBatch is constructed and then never modified // it is safe to concurently read (but not modify obviously). - inserts := q.currBatch.insertsByCPUCore[xsync.CPUCore()] + queueOffset := 0 + if batchLen > 0 { + // Add randomization. 
		queueOffset += int(pending[0].Entry.EnqueuedAt.UnixNano()) % queuesPerCPUCore
+	}
+	queueIdx := (xsync.CPUCore() * queuesPerCPUCore) + queueOffset
+	inserts := q.currBatch.insertsByCPUCore[queueIdx]
 	inserts.Lock()
 	firstInsert := len(inserts.batchInserts) == 0
 	inserts.batchInserts = append(inserts.batchInserts, pending...)
@@ -340,8 +348,8 @@ func newNsIndexInsertBatch(
 		namespace: namespace,
 		nowFn:     nowFn,
 	}
-	numCores := xsync.NumCores()
-	for i := 0; i < numCores; i++ {
+	numQueues := xsync.NumCores() * queuesPerCPUCore
+	for i := 0; i < numQueues; i++ {
 		b.insertsByCPUCore = append(b.insertsByCPUCore, &nsIndexInsertsByCPUCore{
 			metrics: newNamespaceIndexInsertsByCPUCoreMetrics(i, scope),
 		})
diff --git a/src/dbnode/storage/shard_insert_queue.go b/src/dbnode/storage/shard_insert_queue.go
index 794b9848a3..ac272c82d5 100644
--- a/src/dbnode/storage/shard_insert_queue.go
+++ b/src/dbnode/storage/shard_insert_queue.go
@@ -269,9 +269,10 @@ func (q *dbShardInsertQueue) Stop() error {
 }
 
 func (q *dbShardInsertQueue) Insert(insert dbShardInsert) (*sync.WaitGroup, error) {
+	now := q.nowFn()
 	if !insert.opts.skipRateLimit {
 		if limit := q.insertPerSecondLimit.Load(); limit > 0 {
-			windowNanos := uint64(q.nowFn().Truncate(time.Second).UnixNano())
+			windowNanos := uint64(now.Truncate(time.Second).UnixNano())
 			currLimitWindowNanos := q.insertPerSecondLimitWindowNanos.Load()
 			if currLimitWindowNanos != windowNanos {
 				// Rolled into a new window.
@@ -288,7 +289,14 @@ func (q *dbShardInsertQueue) Insert(insert dbShardInsert) (*sync.WaitGroup, erro
 		}
 	}
 
-	inserts := q.currBatch.insertsByCPUCore[xsync.CPUCore()]
+	// Choose the queue relevant to current CPU index.
+	// Note: since inserts by CPU core is allocated when
+	// dbShardInsertBatch is constructed and then never modified
+	// it is safe to concurrently read (but not modify obviously).
+	// Add randomization.
+	queueOffset := int(now.UnixNano()) % queuesPerCPUCore
+	queueIdx := (xsync.CPUCore() * queuesPerCPUCore) + queueOffset
+	inserts := q.currBatch.insertsByCPUCore[queueIdx]
 	inserts.Lock()
 	// Track if first insert, if so then we need to notify insert loop,
 	// otherwise we already have a pending notification.
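The queue index computed above fans inserts out over numCores x queuesPerCPUCore locked buckets, so concurrent writers scheduled on the same core still spread across several locks. A simplified, self-contained sketch of the same sharding (insertBucket and shardedQueue are invented names):

package main

import (
	"fmt"
	"sync"
	"time"
)

const queuesPerCore = 128

type insertBucket struct {
	mu      sync.Mutex
	pending []int
}

type shardedQueue struct {
	// Sized once at construction and never resized, so indexing into it
	// concurrently without a global lock is safe.
	buckets []insertBucket
}

func newShardedQueue(numCores int) *shardedQueue {
	return &shardedQueue{buckets: make([]insertBucket, numCores*queuesPerCore)}
}

func (q *shardedQueue) insert(core, v int) {
	// A cheap time-derived offset spreads writers on the same core across
	// that core's queues.
	offset := int(time.Now().UnixNano() % int64(queuesPerCore))
	b := &q.buckets[core*queuesPerCore+offset]
	b.mu.Lock()
	b.pending = append(b.pending, v)
	b.mu.Unlock()
}

func main() {
	q := newShardedQueue(4)
	q.insert(2, 42)
	fmt.Println(len(q.buckets)) // 512 buckets across 4 cores
}
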
@@ -387,8 +395,8 @@ func newDbShardInsertBatch( nowFn: nowFn, wg: &sync.WaitGroup{}, } - numCores := xsync.NumCores() - for i := 0; i < numCores; i++ { + numQueues := xsync.NumCores() * queuesPerCPUCore + for i := 0; i < numQueues; i++ { b.insertsByCPUCore = append(b.insertsByCPUCore, &dbShardInsertsByCPUCore{ wg: b.wg, metrics: newDBShardInsertsByCPUCoreMetrics(i, scope), From de3ce9a695e6e65f8cabd86d3ab6fb173011520a Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 29 Jan 2021 16:15:28 -0500 Subject: [PATCH 081/106] 4x the queues per CPU core --- src/dbnode/storage/index_insert_queue.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dbnode/storage/index_insert_queue.go b/src/dbnode/storage/index_insert_queue.go index acbbfecab7..00f77fae41 100644 --- a/src/dbnode/storage/index_insert_queue.go +++ b/src/dbnode/storage/index_insert_queue.go @@ -53,7 +53,7 @@ const ( indexResetAllInsertsEvery = 3 * time.Minute - queuesPerCPUCore = 32 + queuesPerCPUCore = 128 ) type nsIndexInsertQueue struct { From 834efa60d483717ba246e1e5b4bbcac440b81e96 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sun, 7 Feb 2021 03:10:58 -0500 Subject: [PATCH 082/106] Use series lookup as source of truth for if indexed or not and if should phase out --- src/dbnode/storage/index.go | 23 ++ src/dbnode/storage/index/mutable_segments.go | 383 ++++++------------ src/dbnode/storage/index/segments.go | 18 +- src/dbnode/storage/index/types.go | 36 ++ src/dbnode/storage/series/lookup/entry.go | 129 +++--- src/m3ninx/doc/document.go | 4 + .../segment/builder/multi_segments_builder.go | 2 +- src/m3ninx/index/segment/types.go | 12 +- 8 files changed, 276 insertions(+), 331 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 6048b657f1..1c3cad3609 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -606,6 +606,12 @@ func (i *nsIndex) BlockForBlockStart(blockStart time.Time) (index.Block, error) func (i *nsIndex) WriteBatch( batch *index.WriteBatch, ) error { + // Filter anything with a pending index out before acquiring lock. + batch.MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize() + if !batch.PendingAny() { + return nil + } + i.state.RLock() if !i.isOpenWithRLock() { i.state.RUnlock() @@ -648,6 +654,23 @@ func (i *nsIndex) WriteBatch( func (i *nsIndex) WritePending( pending []writes.PendingIndexInsert, ) error { + // Filter anything with a pending index out before acquiring lock. + for j := 0; j < len(pending); j++ { + t := xtime.ToUnixNano(pending[j].Entry.Timestamp.Truncate(i.blockSize)) + if !pending[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) { + continue + } + // Remove this elem by moving tail here and shrinking by one. + n := len(pending) + pending[j] = pending[n-1] + pending = pending[:n-1] + // Reprocess element. 
+ j-- + } + if len(pending) == 0 { + return nil + } + i.state.RLock() if !i.isOpenWithRLock() { i.state.RUnlock() diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index ff37f15c0a..bb88142895 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -23,6 +23,7 @@ package index import ( "errors" "fmt" + "github.com/m3db/m3/src/m3ninx/doc" "math" "runtime" "sync" @@ -69,11 +70,8 @@ type mutableSegments struct { state mutableSegmentsState - foregroundSegments []*readableSeg - backgroundSegments []*readableSeg - indexedSnapshot *builder.IDsMap - backgroundCompactActiveBlockStarts []xtime.UnixNano - backgroundCompactIndexedSnapshot *builder.IDsMap + foregroundSegments []*readableSeg + backgroundSegments []*readableSeg compact mutableSegmentsCompact blockStart time.Time @@ -84,8 +82,8 @@ type mutableSegments struct { optsListener xresource.SimpleCloser writeIndexingConcurrency int - indexedBloomFilterByTimeLock sync.RWMutex - indexedBloomFilterByTime map[xtime.UnixNano]*indexedBloomFilter + sealedBlockStarts map[xtime.UnixNano]struct{} + backgroundCompactGCPending bool metrics mutableSegmentsMetrics logger *zap.Logger @@ -168,17 +166,15 @@ func newMutableSegments( iopts instrument.Options, ) *mutableSegments { m := &mutableSegments{ - blockStart: blockStart, - blockSize: md.Options().IndexOptions().BlockSize(), - opts: opts, - blockOpts: blockOpts, - compact: mutableSegmentsCompact{opts: opts, blockOpts: blockOpts}, - iopts: iopts, - indexedBloomFilterByTime: make(map[xtime.UnixNano]*indexedBloomFilter), - indexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), - backgroundCompactIndexedSnapshot: builder.NewIDsMap(builder.IDsMapOptions{}), - metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), - logger: iopts.Logger(), + blockStart: blockStart, + blockSize: md.Options().IndexOptions().BlockSize(), + opts: opts, + blockOpts: blockOpts, + compact: mutableSegmentsCompact{opts: opts, blockOpts: blockOpts}, + sealedBlockStarts: make(map[xtime.UnixNano]struct{}), + iopts: iopts, + metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), + logger: iopts.Logger(), } m.optsListener = namespaceRuntimeOptsMgr.RegisterListener(m) return m @@ -191,40 +187,11 @@ func (m *mutableSegments) NotifySealedBlocks( return nil } - removedBlockStarts := false - m.indexedBloomFilterByTimeLock.Lock() - // Remove entire time windows. + m.Lock() for _, blockStart := range sealed { - _, exists := m.indexedBloomFilterByTime[blockStart] - if !exists { - continue - } - // Remove indexed set if block now sealed. - delete(m.indexedBloomFilterByTime, blockStart) - removedBlockStarts = true - } - if removedBlockStarts { - // Remove any from the indexed snapshots, - // only do this work which is expensive - // if and only if there were block starts that - // were actually removed. 
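The WritePending filtering loop above uses the swap-remove idiom: overwrite element j with the tail, shrink the slice by one, and re-check index j. This filters in place without allocating, at the cost of element order. A standalone equivalent:

package main

import "fmt"

// filterInPlace drops elements matching drop without allocating; order is
// not preserved because removed slots are refilled from the tail.
func filterInPlace(pending []int, drop func(int) bool) []int {
	for j := 0; j < len(pending); j++ {
		if !drop(pending[j]) {
			continue
		}
		n := len(pending)
		pending[j] = pending[n-1]
		pending = pending[:n-1]
		j-- // reprocess the element just swapped into slot j
	}
	return pending
}

func main() {
	out := filterInPlace([]int{1, 2, 3, 4}, func(v int) bool { return v%2 == 0 })
	fmt.Println(out) // [1 3]
}
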
- for _, elem := range m.indexedSnapshot.Iter() { - id := elem.Key() - contained := false - for _, filter := range m.indexedBloomFilterByTime { - if filter.ContainsWithNoFalsePositive(id) { - contained = true - break - } - } - if !contained { - m.indexedSnapshot.Delete(id) - } - } + m.sealedBlockStarts[blockStart] = struct{}{} } - m.indexedBloomFilterByTimeLock.Unlock() - - m.Lock() + m.backgroundCompactGCPending = true m.maybeBackgroundCompactWithLock() m.Unlock() @@ -276,90 +243,24 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, segmentBuilder := m.compact.segmentBuilder m.Unlock() - // Updsate indexedBloomFilterByTime if needed. - var activeBlockStarts []xtime.UnixNano - if m.blockOpts.InMemoryBlock { - // Take references to the pending entries docs - // and make sure not to touch sort order until later. - entries := inserts.PendingEntries() - docs := inserts.PendingDocs() - - m.indexedBloomFilterByTimeLock.Lock() - // Add to the indexed snapshot set. - for i := range docs { - m.indexedSnapshot.SetUnsafe(docs[i].ID, struct{}{}, builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) - } - - // Remove for indexing anything already indexed and - // also update the tracking of what things have been indexed - // for what block starts. - for i := range entries { - blockStart := entries[i].indexBlockStart(m.blockSize) - needsIndex := true - needsBloomFilterWrite := true - for bloomFilterBlockStart, bloomFilter := range m.indexedBloomFilterByTime { - if bloomFilter.ContainsWithNoFalsePositive(docs[i].ID) { - // Already indexed, do not need to index. - needsIndex = false - if blockStart == bloomFilterBlockStart { - // Do not need to update the fact that this - // ID is contained by this block start. - needsBloomFilterWrite = false - break - } - } - } - - if !needsIndex { - // Mark the fact that it doesn't need indexing. - inserts.MarkEntrySuccess(i) - m.metrics.activeBlockIndexExists.Inc(1) - } else { - m.metrics.activeBlockIndexNew.Inc(1) - } - - if !needsBloomFilterWrite { - // No need to update the bloom filter. - m.metrics.activeBlockBloomExists.Inc(1) - continue - } - - if !needsIndex { - m.metrics.activeBlockBloomUpdate.Inc(1) - } else { - m.metrics.activeBlockBloomNew.Inc(1) - } - - bloomFilter, ok := m.indexedBloomFilterByTime[blockStart] - if !ok { - bloomFilter = newIndexedBloomFilter() - m.indexedBloomFilterByTime[blockStart] = bloomFilter - } - bloomFilter.Write(docs[i].ID) - } - // Update bloom filter snapshots if required and also - // track the active block starts. - activeBlockStarts = make([]xtime.UnixNano, 0, len(m.indexedBloomFilterByTime)) - for blockStart := range m.indexedBloomFilterByTime { - activeBlockStarts = append(activeBlockStarts, blockStart) - } - m.indexedBloomFilterByTimeLock.Unlock() - } - defer func() { m.Lock() - m.backgroundCompactActiveBlockStarts = activeBlockStarts m.compact.compactingForeground = false m.cleanupForegroundCompactWithLock() m.Unlock() }() + docs := inserts.PendingDocs() + entries := inserts.PendingEntries() + + // Set the doc ref for later recall. 
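The doc ref set in the lines just below gives each document a back-reference to its series entry, which the garbage-collecting compaction later recovers with a checked type assertion (see the DocumentsFilterFn further down). A small sketch of that round trip, with invented types:

package main

import "fmt"

type document struct {
	ID  []byte
	Ref interface{} // opaque back-reference, as on m3ninx's doc.Document
}

type onIndexSeries interface {
	StillIndexed() bool
}

type seriesEntry struct{ indexed bool }

func (s *seriesEntry) StillIndexed() bool { return s.indexed }

// keep decides whether a compacted segment retains the document, defaulting
// to keeping it if the back-reference is missing or of an unexpected type.
func keep(d document) bool {
	ref, ok := d.Ref.(onIndexSeries)
	if !ok {
		return true
	}
	return ref.StillIndexed()
}

func main() {
	d := document{ID: []byte("series-a"), Ref: &seriesEntry{indexed: true}}
	fmt.Println(keep(d)) // true
}
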
+ for i := range entries { + docs[i].Ref = entries[i].OnIndexSeries + } + segmentBuilder.Reset() insertResultErr := segmentBuilder.InsertBatch(m3ninxindex.Batch{ - Docs: inserts.PendingDocs(), + Docs: docs, AllowPartialUpdates: true, }) if len(segmentBuilder.Docs()) == 0 { @@ -370,7 +271,7 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, // We inserted some documents, need to compact immediately into a // foreground segment from the segment builder before we can serve reads // from an FST segment. - result, err := m.foregroundCompactWithBuilder(segmentBuilder, activeBlockStarts) + result, err := m.foregroundCompactWithBuilder(segmentBuilder) if err != nil { return MutableSegmentsStats{}, err } @@ -544,141 +445,94 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { } var ( - activeBlockStarts []xtime.UnixNano gcPlan = &compaction.Plan{} gcAlreadyRunning = m.compact.compactingBackgroundGarbageCollect + sealedBlockStarts = make(map[xtime.UnixNano]struct{}, len(m.sealedBlockStarts)) ) - if m.blockOpts.InMemoryBlock { - mayNeedFiltering := false - activeBlockStarts = m.backgroundCompactActiveBlockStarts - if len(activeBlockStarts) > 0 { - mayNeedFiltering = true - } - - // Now check which segments need filtering if and only if - // we're not background compacting. - if !gcAlreadyRunning && mayNeedFiltering { - for _, seg := range m.backgroundSegments { - alreadyHasTask := false - for _, task := range plan.Tasks { - for _, taskSegment := range task.Segments { - if taskSegment.Segment == seg.Segment() { - alreadyHasTask = true - break - } - } - } - if alreadyHasTask { - // Skip needing to check if segment needs filtering. - continue - } - - activeBlockStartsOutdated := false - for _, blockStart := range seg.containedBlockStarts { - found := false - for _, activeBlockStart := range activeBlockStarts { - if activeBlockStart == blockStart { - found = true - break - } - } - if !found { - // Contains an active block start that should be removed. - activeBlockStartsOutdated = true + // Take copy of sealed block starts so can act on this + // async. + for k, v := range m.sealedBlockStarts { + sealedBlockStarts[k] = v + } + if !gcAlreadyRunning && m.backgroundCompactGCPending { + m.backgroundCompactGCPending = false + + for _, seg := range m.backgroundSegments { + alreadyHasTask := false + for _, task := range plan.Tasks { + for _, taskSegment := range task.Segments { + if taskSegment.Segment == seg.Segment() { + alreadyHasTask = true break } } + } + if alreadyHasTask { + // Skip needing to check if segment needs filtering. + continue + } - if !activeBlockStartsOutdated { - continue - } + // The active block starts are outdated, need to compact + // and remove any old data from the segment. + var task compaction.Task + if len(gcPlan.Tasks) > 0 { + task = gcPlan.Tasks[0] + } - // The active block starts are outdated, need to compact - // and remove any old data from the segment. - gcPlan.Tasks = append(gcPlan.Tasks, compaction.Task{ - Segments: []compaction.Segment{ - { - Age: seg.Age(), - Size: seg.Segment().Size(), - Type: segments.FSTType, - Segment: seg.Segment(), - }, - }, - }) + task.Segments = append(task.Segments, compaction.Segment{ + Age: seg.Age(), + Size: seg.Segment().Size(), + Type: segments.FSTType, + Segment: seg.Segment(), + }) - // Mark as not-compactable for standard compactions - // since this will be async compacted into a smaller - // segment. 
- seg.garbageCollecting = true + if len(gcPlan.Tasks) == 0 { + gcPlan.Tasks = make([]compaction.Task, 1) } + gcPlan.Tasks[0] = task + + // Mark as not-compactable for standard compactions + // since this will be async compacted into a smaller + // segment. + seg.garbageCollecting = true } } - if len(plan.Tasks) == 0 && len(gcPlan.Tasks) == 0 { - return + if len(plan.Tasks) != 0 { + // Kick off compaction. + m.compact.compactingBackgroundStandard = true + go func() { + m.backgroundCompactWithPlan(plan, m.compact.backgroundCompactors, + sealedBlockStarts) + + m.Lock() + m.compact.compactingBackgroundStandard = false + m.cleanupBackgroundCompactWithLock() + m.Unlock() + }() } - // Kick off compaction. - m.compact.compactingBackgroundStandard = true if len(gcPlan.Tasks) != 0 { + // Run non-GC tasks separately so the standard loop is not blocked. m.compact.compactingBackgroundGarbageCollect = true - } - go func() { - var activeFilter segment.DocumentsFilter - m.indexedBloomFilterByTimeLock.Lock() - if n := m.indexedSnapshot.Len(); n > 0 { - // Only set the bloom filter to actively filter series out - // if there were any segments that need the active block starts - // updated. - if !gcAlreadyRunning { - // Make sure to only mutate the indexed snapshot - // if GC isn't already running since otherwise we'll be - // concurrently writing to the snapshot that's being used for - // filtering by segments that are being GC'ed. - // Copy the indexed snapshot map so can use it downstream safely - // without holding a lock. - m.backgroundCompactIndexedSnapshot.Reset() - for _, elem := range m.indexedSnapshot.Iter() { - m.backgroundCompactIndexedSnapshot.SetUnsafe(elem.Key(), struct{}{}, - builder.IDsMapSetUnsafeOptions{ - NoCopyKey: true, - NoFinalizeKey: true, - }) - } + go func() { + compactors, err := m.compact.allocBackgroundCompactorsGarbageCollect() + if err != nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("error background gc segments", zap.Error(err)) + }) + } else { + m.backgroundCompactWithPlan(gcPlan, compactors, + sealedBlockStarts) + m.closeCompactors(compactors) } - activeFilter = m.backgroundCompactIndexedSnapshot - } - m.indexedBloomFilterByTimeLock.Unlock() - - if len(gcPlan.Tasks) != 0 { - // Run non-GC tasks separately so the standard loop is not blocked. 
- go func() { - compactors, err := m.compact.allocBackgroundCompactorsGarbageCollect() - if err != nil { - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("error background gc segments", zap.Error(err)) - }) - } else { - m.backgroundCompactWithPlan(gcPlan, activeBlockStarts, - activeFilter, compactors) - m.closeCompactors(compactors) - } - - m.Lock() - m.compact.compactingBackgroundGarbageCollect = false - m.cleanupBackgroundCompactWithLock() - m.Unlock() - }() - } - m.backgroundCompactWithPlan(plan, activeBlockStarts, - activeFilter, m.compact.backgroundCompactors) - - m.Lock() - m.compact.compactingBackgroundStandard = false - m.cleanupBackgroundCompactWithLock() - m.Unlock() - }() + m.Lock() + m.compact.compactingBackgroundGarbageCollect = false + m.cleanupBackgroundCompactWithLock() + m.Unlock() + }() + } } func (m *mutableSegments) shouldEvictCompactedSegmentsWithLock() bool { @@ -738,9 +592,8 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg func (m *mutableSegments) backgroundCompactWithPlan( plan *compaction.Plan, - activeBlockStarts []xtime.UnixNano, - activeFilter segment.DocumentsFilter, compactors chan *compaction.Compactor, + sealedBlocks map[xtime.UnixNano]struct{}, ) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() defer sw.Stop() @@ -776,8 +629,8 @@ func (m *mutableSegments) backgroundCompactWithPlan( compactors <- compactor wg.Done() }() - err := m.backgroundCompactWithTask(task, activeBlockStarts, - activeFilter, compactor, log, logger.With(zap.Int("task", i))) + err := m.backgroundCompactWithTask(task, compactor, + sealedBlocks, log, logger.With(zap.Int("task", i))) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { l.Error("error compacting segments", zap.Error(err)) @@ -802,9 +655,8 @@ func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, - activeBlockStarts []xtime.UnixNano, - activeFilter segment.DocumentsFilter, compactor *compaction.Compactor, + sealedBlocks map[xtime.UnixNano]struct{}, log bool, logger *zap.Logger, ) error { @@ -819,7 +671,27 @@ func (m *mutableSegments) backgroundCompactWithTask( start := time.Now() compacted, err := compactor.Compact(segments, - activeFilter, + segment.DocumentsFilterFn(func(d doc.Document) bool { + // Filter out any documents that only were indexed for + // sealed blocks. + if d.Ref == nil { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("unexpected nil for document ref for background compact") + }) + return true + } + onIndexSeries, ok := d.Ref.(OnIndexSeries) + if !ok { + instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { + l.Error("unexpected type for document ref for background compact") + }) + return true + } + result := onIndexSeries.RemoveIndexedForBlockStarts(sealedBlocks) + // Keep the series if and only if there are remaining + // index block starts outside of the sealed blocks starts. 
+ return result.IndexedBlockStartsRemaining > 0 + }), m.metrics.activeBlockGarbageCollectSeries, mmap.ReporterOptions{ Context: mmap.Context{ @@ -858,7 +730,7 @@ func (m *mutableSegments) backgroundCompactWithTask( defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, replaceSegment, activeBlockStarts) + segments, replaceSegment) m.backgroundSegments = result return nil @@ -868,7 +740,6 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( current []*readableSeg, segmentsJustCompacted []segment.Segment, compacted segment.Segment, - activeBlockStarts []xtime.UnixNano, ) []*readableSeg { result := make([]*readableSeg, 0, len(current)) for _, existing := range current { @@ -901,12 +772,11 @@ func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( } // Return all the ones we kept plus the new compacted segment - return append(result, newReadableSeg(compacted, activeBlockStarts, m.opts)) + return append(result, newReadableSeg(compacted, m.opts)) } func (m *mutableSegments) foregroundCompactWithBuilder( builder segment.DocumentsBuilder, - activeBlockStarts []xtime.UnixNano, ) (MutableSegmentsStats, error) { // We inserted some documents, need to compact immediately into a // foreground segment. @@ -978,7 +848,7 @@ func (m *mutableSegments) foregroundCompactWithBuilder( // Run the first task, without resetting the builder. result, err := m.foregroundCompactWithTask(builder, plan.Tasks[0], - activeBlockStarts, log, logger.With(zap.Int("task", 0))) + log, logger.With(zap.Int("task", 0))) if err != nil { return result, err } @@ -995,7 +865,7 @@ func (m *mutableSegments) foregroundCompactWithBuilder( // Now use the builder after resetting it. builder.Reset() result, err = m.foregroundCompactWithTask(builder, task, - activeBlockStarts, log, logger.With(zap.Int("task", i))) + log, logger.With(zap.Int("task", i))) if err != nil { return result, err } @@ -1048,7 +918,6 @@ func (m *mutableSegments) maybeMoveForegroundSegmentsToBackgroundWithLock( func (m *mutableSegments) foregroundCompactWithTask( builder segment.DocumentsBuilder, task compaction.Task, - activeBlockStarts []xtime.UnixNano, log bool, logger *zap.Logger, ) (MutableSegmentsStats, error) { @@ -1092,7 +961,7 @@ func (m *mutableSegments) foregroundCompactWithTask( defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.foregroundSegments, - segments, segment, activeBlockStarts) + segments, segment) m.foregroundSegments = result foregroundNumSegments, foregroundNumDocs := numSegmentsAndDocs(m.foregroundSegments) backgroundNumSegments, backgroundNumDocs := numSegmentsAndDocs(m.backgroundSegments) diff --git a/src/dbnode/storage/index/segments.go b/src/dbnode/storage/index/segments.go index 2fc5dba267..6f3134d1c9 100644 --- a/src/dbnode/storage/index/segments.go +++ b/src/dbnode/storage/index/segments.go @@ -25,28 +25,24 @@ import ( "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/x/clock" - xtime "github.com/m3db/m3/src/x/time" ) type readableSeg struct { - nowFn clock.NowFn - createdAt time.Time - segment segment.Segment - containedBlockStarts []xtime.UnixNano - garbageCollecting bool + nowFn clock.NowFn + createdAt time.Time + segment segment.Segment + garbageCollecting bool } func newReadableSeg( seg segment.Segment, - containedBlockStarts []xtime.UnixNano, opts Options, ) *readableSeg { nowFn := opts.ClockOptions().NowFn() return &readableSeg{ - nowFn: nowFn, - createdAt: nowFn(), - segment: seg, - containedBlockStarts: 
containedBlockStarts, + nowFn: nowFn, + createdAt: nowFn(), + segment: seg, } } diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index da5919bb52..d2168aab3d 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -345,6 +345,19 @@ type OnIndexSeries interface { // Further, every call to NeedsIndexUpdate which returns true needs to have a corresponding // OnIndexFinalze() call. This is required for correct lifecycle maintenance. NeedsIndexUpdate(indexBlockStartForWrite xtime.UnixNano) bool + + IfAlreadyIndexedMarkIndexSuccessAndFinalize( + blockStart xtime.UnixNano, + ) bool + + RemoveIndexedForBlockStarts( + blockStarts map[xtime.UnixNano]struct{}, + ) RemoveIndexedForBlockStartsResult +} + +type RemoveIndexedForBlockStartsResult struct { + IndexedBlockStartsRemoved int + IndexedBlockStartsRemaining int } // Block represents a collection of segments. Each `Block` is a complete reverse @@ -701,6 +714,15 @@ func (b *WriteBatch) ForEachUnmarkedBatchByBlockStart( } } +func (b *WriteBatch) PendingAny() bool { + for i := range b.entries { + if !b.entries[i].result.Done { + return true + } + } + return false +} + func (b *WriteBatch) numPending() int { numUnmarked := 0 for i := range b.entries { @@ -782,6 +804,20 @@ func (b *WriteBatch) MarkEntrySuccess(idx int) { } } +// MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize marks an entry as success. +func (b *WriteBatch) MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize() { + for idx := range b.entries { + if !b.entries[idx].result.Done { + blockStart := b.entries[idx].indexBlockStart(b.opts.IndexBlockSize) + r := b.entries[idx].OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(blockStart) + if r { + b.entries[idx].result.Done = true + b.entries[idx].result.Err = nil + } + } + } +} + // MarkUnmarkedEntriesError marks all unmarked entries as error. 
func (b *WriteBatch) MarkUnmarkedEntriesError(err error) { for idx := range b.entries { diff --git a/src/dbnode/storage/series/lookup/entry.go b/src/dbnode/storage/series/lookup/entry.go index a1319af9c7..31553d7176 100644 --- a/src/dbnode/storage/series/lookup/entry.go +++ b/src/dbnode/storage/series/lookup/entry.go @@ -102,8 +102,8 @@ func NewEntry(opts NewEntryOptions) *Entry { indexWriter: opts.IndexWriter, nowFn: nowFn, pendingIndexBatchSizeOne: make([]writes.PendingIndexInsert, 1), + reverseIndex: newEntryIndexState(), } - entry.reverseIndex.states = entry.reverseIndex._staticAloc[:0] return entry } @@ -202,6 +202,45 @@ func (entry *Entry) OnIndexFinalize(blockStartNanos xtime.UnixNano) { entry.DecrementReaderWriterCount() } +func (entry *Entry) IfAlreadyIndexedMarkIndexSuccessAndFinalize( + blockStart xtime.UnixNano, +) bool { + successAlready := false + entry.reverseIndex.Lock() + for _, state := range entry.reverseIndex.states { + if state.success { + successAlready = true + break + } + } + if successAlready { + entry.reverseIndex.setSuccessWithWLock(blockStart) + entry.reverseIndex.setAttemptWithWLock(blockStart, false) + } + entry.reverseIndex.Unlock() + // indicate the index has released held reference for provided write + entry.DecrementReaderWriterCount() + return successAlready +} + +func (entry *Entry) RemoveIndexedForBlockStarts( + blockStarts map[xtime.UnixNano]struct{}, +) index.RemoveIndexedForBlockStartsResult { + var result index.RemoveIndexedForBlockStartsResult + entry.reverseIndex.Lock() + for k, state := range entry.reverseIndex.states { + _, ok := blockStarts[k] + if ok && state.success { + delete(entry.reverseIndex.states, k) + result.IndexedBlockStartsRemoved++ + continue + } + result.IndexedBlockStartsRemaining++ + } + entry.reverseIndex.Unlock() + return result +} + // Write writes a new value. func (entry *Entry) Write( ctx context.Context, @@ -268,96 +307,64 @@ func (entry *Entry) maybeIndex(timestamp time.Time) error { // have a write for the 12-2p block from the 2-4p block, or we'd drop the late write. type entryIndexState struct { sync.RWMutex - states []entryIndexBlockState - - // NB(prateek): we alloc an array (not slice) of size 3, as that is - // the most we will need (only 3 blocks should ever be written to - // simultaneously in the worst case). We allocate it like we're doing - // to ensure it's along side the rest of the struct in memory. But - // we only access it through `states`, to ensure that it can be - // grown/shrunk as needed. Do not acccess it directly. - _staticAloc [3]entryIndexBlockState + states map[xtime.UnixNano]entryIndexBlockState } // entryIndexBlockState is used to capture the state of indexing for a single shard // entry for a given index block start. It's used to prevent attempts at double indexing // for the same block start. 
type entryIndexBlockState struct { - blockStart xtime.UnixNano - attempt bool - success bool + attempt bool + success bool +} + +func newEntryIndexState() entryIndexState { + return entryIndexState{ + states: make(map[xtime.UnixNano]entryIndexBlockState, 4), + } } func (s *entryIndexState) indexedWithRLock(t xtime.UnixNano) bool { - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - return s.states[i].success - } + v, ok := s.states[t] + if ok { + return v.success } return false } func (s *entryIndexState) indexedOrAttemptedWithRLock(t xtime.UnixNano) bool { - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - return s.states[i].success || s.states[i].attempt - } + v, ok := s.states[t] + if ok { + return v.success || v.attempt } return false } func (s *entryIndexState) setSuccessWithWLock(t xtime.UnixNano) { - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - s.states[i].success = true - return - } + if s.indexedWithRLock(t) { + return } // NB(r): If not inserted state yet that means we need to make an insertion, // this will happen if synchronously indexing and we haven't called // NeedIndexUpdate before we indexed the series. - s.insertBlockState(entryIndexBlockState{ - blockStart: t, - success: true, - }) + s.states[t] = entryIndexBlockState{ + success: true, + } } func (s *entryIndexState) setAttemptWithWLock(t xtime.UnixNano, attempt bool) { - // first check if we have the block start in the slice already - for i := range s.states { - if s.states[i].blockStart.Equal(t) { - s.states[i].attempt = attempt - return + v, ok := s.states[t] + if ok { + if v.success { + return // Attempt is not relevant if success. } - } - - s.insertBlockState(entryIndexBlockState{ - blockStart: t, - attempt: attempt, - }) -} - -func (s *entryIndexState) insertBlockState(newState entryIndexBlockState) { - // i.e. we don't have the block start in the slice - // if we have less than 3 elements, we can just insert an element to the slice. - if len(s.states) < 3 { - s.states = append(s.states, newState) + v.attempt = attempt + s.states[t] = v return } - // i.e. len(s.states) == 3, in this case, we update the entry with the lowest block start - // as we know only 3 writes can be active at any point. Think of this as a lazy compaction. - var ( - minIdx = -1 - minBlockStart = xtime.UnixNano(maxInt64) - ) - for idx, blockState := range s.states { - if blockState.blockStart < minBlockStart { - minIdx = idx - minBlockStart = blockState.blockStart - } + s.states[t] = entryIndexBlockState{ + attempt: attempt, } - - s.states[minIdx] = newState } diff --git a/src/m3ninx/doc/document.go b/src/m3ninx/doc/document.go index 164c0f210e..12ec86a703 100644 --- a/src/m3ninx/doc/document.go +++ b/src/m3ninx/doc/document.go @@ -91,6 +91,10 @@ func (f Fields) shallowCopy() Fields { type Document struct { ID []byte Fields []Field + + // Ref is a general purpose ref to track a related + // object to a document. + Ref interface{} } // Get returns the value of the specified field name in the document if it exists. 
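The Ref back-pointer added to doc.Document above is what lets the background compactor ask, per document, whether its series is still indexed anywhere outside the sealed blocks. The following is a minimal standalone sketch of that pattern (see the DocumentsFilterFn adapter in the next diff); Document, SeriesRef and the block-start map here are simplified stand-ins for the real m3ninx/doc and index types, not the actual API:

package main

import "fmt"

// Document mirrors the Ref idea above: a general purpose back-reference
// carried alongside the indexed fields (simplified stand-in).
type Document struct {
	ID  []byte
	Ref interface{}
}

// SeriesRef is a hypothetical stand-in for OnIndexSeries: the document
// points back at live series state so the compactor can ask which index
// block starts are still relevant.
type SeriesRef struct {
	indexedBlockStarts map[int64]struct{}
}

// RemoveIndexedForBlockStarts drops the sealed block starts and reports
// how many block starts remain indexed.
func (r *SeriesRef) RemoveIndexedForBlockStarts(sealed map[int64]struct{}) int {
	for t := range sealed {
		delete(r.indexedBlockStarts, t)
	}
	return len(r.indexedBlockStarts)
}

// DocumentsFilterFn mirrors the adapter added in segment/types.go.
type DocumentsFilterFn func(d Document) bool

func main() {
	sealed := map[int64]struct{}{100: {}, 200: {}}
	filter := DocumentsFilterFn(func(d Document) bool {
		ref, ok := d.Ref.(*SeriesRef)
		if !ok {
			return true // keep documents we cannot classify
		}
		// Keep the document only if index block starts remain outside
		// the sealed set, matching the compaction filter above.
		return ref.RemoveIndexedForBlockStarts(sealed) > 0
	})

	d := Document{ID: []byte("series-a"), Ref: &SeriesRef{
		indexedBlockStarts: map[int64]struct{}{200: {}, 300: {}},
	}}
	fmt.Println(filter(d)) // true: block start 300 is still indexed
}

Returning false drops the document from the compacted output, which is how fully sealed series get garbage collected without a separate deletion pass.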
diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go index 545ebf63ee..bec1f5c951 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_builder.go +++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go @@ -132,7 +132,7 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { skip = append(skip, iter.PostingsID()) continue } - if b.filter != nil && !b.filter.Contains(d.ID) { + if b.filter != nil && !b.filter.Contains(d) { // Actively filtering and ID is not contained. skip = append(skip, iter.PostingsID()) if b.filterCount != nil { diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go index dd911d5075..8cc9f2f20a 100644 --- a/src/m3ninx/index/segment/types.go +++ b/src/m3ninx/index/segment/types.go @@ -232,5 +232,15 @@ type SegmentsBuilder interface { // DocumentsFilter is a documents filter. type DocumentsFilter interface { - Contains(id []byte) bool + Contains(d doc.Document) bool +} + +// DocumentsFilterFn implements DocumentsFilter. +type DocumentsFilterFn func(d doc.Document) bool + +var _ DocumentsFilter = DocumentsFilterFn(nil) + +// Contains implements the DocumentsFilter interface. +func (f DocumentsFilterFn) Contains(d doc.Document) bool { + return f(d) } From 3699014aa81bce15a88aa91e31114fb03823a184 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 8 Feb 2021 01:05:31 -0500 Subject: [PATCH 083/106] Only run background GC when sealed blocks updated --- src/dbnode/storage/index/mutable_segments.go | 37 ++++++++++++++----- .../builder/multi_segments_terms_iter.go | 9 +++++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index bb88142895..681923b818 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -188,11 +188,21 @@ func (m *mutableSegments) NotifySealedBlocks( } m.Lock() + updated := false for _, blockStart := range sealed { + _, exists := m.sealedBlockStarts[blockStart] + if exists { + continue + } m.sealedBlockStarts[blockStart] = struct{}{} + updated = true + } + if updated { + // Only trigger background compact GC if + // and only if updated the sealed block starts. 
+ m.backgroundCompactGCPending = true + m.maybeBackgroundCompactWithLock() } - m.backgroundCompactGCPending = true - m.maybeBackgroundCompactWithLock() m.Unlock() return nil @@ -445,6 +455,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { } var ( + gcRequired = false gcPlan = &compaction.Plan{} gcAlreadyRunning = m.compact.compactingBackgroundGarbageCollect sealedBlockStarts = make(map[xtime.UnixNano]struct{}, len(m.sealedBlockStarts)) @@ -455,6 +466,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { sealedBlockStarts[k] = v } if !gcAlreadyRunning && m.backgroundCompactGCPending { + gcRequired = true m.backgroundCompactGCPending = false for _, seg := range m.backgroundSegments { @@ -503,7 +515,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { m.compact.compactingBackgroundStandard = true go func() { m.backgroundCompactWithPlan(plan, m.compact.backgroundCompactors, - sealedBlockStarts) + gcRequired, sealedBlockStarts) m.Lock() m.compact.compactingBackgroundStandard = false @@ -523,7 +535,7 @@ func (m *mutableSegments) maybeBackgroundCompactWithLock() { }) } else { m.backgroundCompactWithPlan(gcPlan, compactors, - sealedBlockStarts) + gcRequired, sealedBlockStarts) m.closeCompactors(compactors) } @@ -593,6 +605,7 @@ func (m *mutableSegments) closeCompactedSegmentsWithLock(segments []*readableSeg func (m *mutableSegments) backgroundCompactWithPlan( plan *compaction.Plan, compactors chan *compaction.Compactor, + gcRequired bool, sealedBlocks map[xtime.UnixNano]struct{}, ) { sw := m.metrics.backgroundCompactionPlanRunLatency.Start() @@ -629,7 +642,7 @@ func (m *mutableSegments) backgroundCompactWithPlan( compactors <- compactor wg.Done() }() - err := m.backgroundCompactWithTask(task, compactor, + err := m.backgroundCompactWithTask(task, compactor, gcRequired, sealedBlocks, log, logger.With(zap.Int("task", i))) if err != nil { instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { @@ -656,6 +669,7 @@ func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment func (m *mutableSegments) backgroundCompactWithTask( task compaction.Task, compactor *compaction.Compactor, + gcRequired bool, sealedBlocks map[xtime.UnixNano]struct{}, log bool, logger *zap.Logger, @@ -669,9 +683,10 @@ func (m *mutableSegments) backgroundCompactWithTask( segments = append(segments, seg.Segment) } - start := time.Now() - compacted, err := compactor.Compact(segments, - segment.DocumentsFilterFn(func(d doc.Document) bool { + var documentsFilter segment.DocumentsFilter + if gcRequired { + // Only actively filter out documents if GC is required. + documentsFilter = segment.DocumentsFilterFn(func(d doc.Document) bool { // Filter out any documents that only were indexed for // sealed blocks. if d.Ref == nil { @@ -691,7 +706,11 @@ func (m *mutableSegments) backgroundCompactWithTask( // Keep the series if and only if there are remaining // index block starts outside of the sealed blocks starts. 
return result.IndexedBlockStartsRemaining > 0 - }), + }) + } + + start := time.Now() + compacted, err := compactor.Compact(segments, documentsFilter, m.metrics.activeBlockGarbageCollectSeries, mmap.ReporterOptions{ Context: mmap.Context{ diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go index 5c40542ab7..f05a2fb203 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go @@ -161,10 +161,19 @@ func (i *termsIterFromSegments) Next() bool { ) for iter.Next() { curr := iter.Current() + factor := 2 + // First do exponential skipping. + for len(skip) >= factor && curr > skip[factor-1] { + skip = skip[factor:] + negativeOffset += postings.ID(factor) + factor *= 2 + } + // Then linear. for len(skip) > 0 && curr > skip[0] { skip = skip[1:] negativeOffset++ } + // Then skip the individual if matches. if len(skip) > 0 && curr == skip[0] { skip = skip[1:] negativeOffset++ From 4934faf05fbebf2adbd38d7e41921dc65f99cbfd Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 8 Feb 2021 01:11:18 -0500 Subject: [PATCH 084/106] Remove purge expired series warning --- src/dbnode/storage/shard.go | 3 ++- src/x/instrument/invariant.go | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go index 029a8b9049..8f4008e0ff 100644 --- a/src/dbnode/storage/shard.go +++ b/src/dbnode/storage/shard.go @@ -859,7 +859,8 @@ func (s *dbShard) purgeExpiredSeries(expiredEntries []*lookup.Entry) { count := entry.ReaderWriterCount() // The contract requires all entries to have count >= 1. if count < 1 { - s.logger.Error("purgeExpiredSeries encountered invalid series read/write count", + instrument.EmitInvariantViolation(s.opts.InstrumentOptions()) + s.logger.Debug("purgeExpiredSeries encountered invalid series read/write count", zap.String("series", series.ID().String()), zap.Int32("readerWriterCount", count)) continue diff --git a/src/x/instrument/invariant.go b/src/x/instrument/invariant.go index baae4be76a..924e4e6dd5 100644 --- a/src/x/instrument/invariant.go +++ b/src/x/instrument/invariant.go @@ -60,6 +60,7 @@ func EmitInvariantViolation(opts Options) { // NB(prateek): there's no need to cache this metric. It should be never // be called in production systems unless something is seriously messed // up. At which point, the extra map alloc should be of no concern. + // TODO: Require a "name" tag here to add to the metric with "error_type". 
 	opts.MetricsScope().Counter(InvariantViolatedMetricName).Inc(1)
 
 	panicIfEnvSet()

From 55632a971812239bde959e4650411e53011d1d82 Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Tue, 9 Feb 2021 02:26:24 -0500
Subject: [PATCH 085/106] Use O(1) lookup for negative offsets for postings
 IDs in multisegments builder

---
 .../segment/builder/multi_segments_builder.go | 50 ++++++++++++-------
 ...i_segments_multi_key_postings_list_iter.go | 21 +++-----
 .../builder/multi_segments_terms_iter.go      | 28 +++--------
 3 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go
index bec1f5c951..a0f788b31e 100644
--- a/src/m3ninx/index/segment/builder/multi_segments_builder.go
+++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go
@@ -46,12 +46,15 @@ type builderFromSegments struct {
 type segmentMetadata struct {
 	segment segment.Segment
 	offset  postings.ID
-	// skipAsc is a lookup of document IDs are duplicates or
-	// to filter out in this segment, that is documents that are already
-	// contained by other segments or should not be included
-	// in the output segment and hence should not be
-	// returned when looking up documents.
-	skipAsc []postings.ID
+	// negativeOffsets is a lookup of which document IDs are duplicates or
+	// should be skipped, that is documents that are already contained by
+	// other segments or should not be included in the output segment and
+	// hence should not be returned when looking up documents. If this is
+	// the case the offset is -1.
+	// If a document ID is not a duplicate or skipped then the offset is
+	// the shift that should be applied when translating this postings ID
+	// to the result postings ID.
+	negativeOffsets []int64
+	skips           int64
 }
 
 // NewBuilderFromSegments returns a new builder from segments.
@@ -79,7 +82,10 @@ func (b *builderFromSegments) Reset() {
 	b.segmentsOffset = 0
 	var emptySegment segmentMetadata
 	for i := range b.segments {
+		// Save the offsets array.
+		negativeOffsets := b.segments[i].negativeOffsets
 		b.segments[i] = emptySegment
+		b.segments[i].negativeOffsets = negativeOffsets[:0]
 	}
 	b.segments = b.segments[:0]
 
@@ -121,20 +127,32 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error {
 			return err
 		}
 
+		var negativeOffsets []int64
+		if n := len(b.segments); cap(b.segments) > n {
+			// Take the offsets from the element we're about to reuse.
+			negativeOffsets = b.segments[:n+1][n].negativeOffsets[:0]
+		}
+		if int64(cap(negativeOffsets)) < segment.Size() {
+			negativeOffsets = make([]int64, 0, int(1.5*float64(segment.Size())))
+		}
+
 		var (
-			added int
-			skip  []postings.ID
+			added      int
+			currOffset int64
 		)
 		for iter.Next() {
 			d := iter.Current()
+			negativeOffsets = append(negativeOffsets, currOffset)
 			if b.idSet.Contains(d.ID) {
 				// Skip duplicates.
-				skip = append(skip, iter.PostingsID())
+				negativeOffsets[len(negativeOffsets)-1] = -1
+				currOffset++
 				continue
 			}
 			if b.filter != nil && !b.filter.Contains(d) {
 				// Actively filtering and ID is not contained.
- skip = append(skip, iter.PostingsID()) + negativeOffsets[len(negativeOffsets)-1] = -1 + currOffset++ if b.filterCount != nil { b.filterCount.Inc(1) } @@ -153,15 +171,11 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error { return err } - // Sort duplicates in ascending order - sort.Slice(skip, func(i, j int) bool { - return skip[i] < skip[j] - }) - b.segments = append(b.segments, segmentMetadata{ - segment: segment, - offset: b.segmentsOffset, - skipAsc: skip, + segment: segment, + offset: b.segmentsOffset, + negativeOffsets: negativeOffsets, + skips: currOffset, }) b.segmentsOffset += postings.ID(added) } diff --git a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go index d04dc27e78..49453cc310 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_multi_key_postings_list_iter.go @@ -147,7 +147,7 @@ func (i *multiKeyPostingsListIterator) Next() bool { return false } - if fieldsKeyIter.segment.offset == 0 && len(fieldsKeyIter.segment.skipAsc) == 0 { + if fieldsKeyIter.segment.offset == 0 && fieldsKeyIter.segment.skips == 0 { // No offset, which means is first segment we are combining from // so can just direct union. // Make sure skipAsc is empty otherwise we need to do filtering. @@ -167,23 +167,18 @@ func (i *multiKeyPostingsListIterator) Next() bool { // We have to taken into account the offset and duplicates var ( - iter = pl.Iterator() - skip = fieldsKeyIter.segment.skipAsc - negativeOffset postings.ID + iter = pl.Iterator() + negativeOffsets = fieldsKeyIter.segment.negativeOffsets ) for iter.Next() { curr := iter.Current() - for len(skip) > 0 && curr > skip[0] { - skip = skip[1:] - negativeOffset++ - } - if len(skip) > 0 && curr == skip[0] { - skip = skip[1:] - negativeOffset++ - // Also skip this value, as itself is a duplicate + negativeOffset := negativeOffsets[curr] + // Then skip the individual if matches. + if negativeOffset == -1 { + // Skip this value, as itself is a duplicate. continue } - value := curr + fieldsKeyIter.segment.offset - negativeOffset + value := curr + fieldsKeyIter.segment.offset - postings.ID(negativeOffset) if err := i.currFieldPostingsList.Insert(value); err != nil { iter.Close() i.err = err diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go index f05a2fb203..86b1b03046 100644 --- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go +++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go @@ -136,7 +136,7 @@ func (i *termsIterFromSegments) Next() bool { termsKeyIter := iter.(*termsKeyIter) _, list := termsKeyIter.iter.Current() - if termsKeyIter.segment.offset == 0 && len(termsKeyIter.segment.skipAsc) == 0 { + if termsKeyIter.segment.offset == 0 && termsKeyIter.segment.skips == 0 { // No offset, which means is first segment we are combining from // so can just direct union. if index.MigrationReadOnlyPostings() { @@ -155,32 +155,18 @@ func (i *termsIterFromSegments) Next() bool { // We have to taken into account the offset and duplicates var ( - iter = list.Iterator() - skip = termsKeyIter.segment.skipAsc - negativeOffset postings.ID + iter = list.Iterator() + negativeOffsets = termsKeyIter.segment.negativeOffsets ) for iter.Next() { curr := iter.Current() - factor := 2 - // First do exponential skipping. 
- for len(skip) >= factor && curr > skip[factor-1] { - skip = skip[factor:] - negativeOffset += postings.ID(factor) - factor *= 2 - } - // Then linear. - for len(skip) > 0 && curr > skip[0] { - skip = skip[1:] - negativeOffset++ - } + negativeOffset := negativeOffsets[curr] // Then skip the individual if matches. - if len(skip) > 0 && curr == skip[0] { - skip = skip[1:] - negativeOffset++ - // Also skip this value, as itself is a duplicate + if negativeOffset == -1 { + // Skip this value, as itself is a duplicate. continue } - value := curr + termsKeyIter.segment.offset - negativeOffset + value := curr + termsKeyIter.segment.offset - postings.ID(negativeOffset) if err := i.currPostingsList.Insert(value); err != nil { iter.Close() i.err = err From 3b8c1d01f9d49fe70e5ca372685cd8616828967b Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 10 Feb 2021 17:57:29 -0500 Subject: [PATCH 086/106] Load term FSTs up front to avoid slow queries during compactions --- src/m3ninx/index/segment/fst/segment.go | 119 ++++++++++-------------- 1 file changed, 51 insertions(+), 68 deletions(-) diff --git a/src/m3ninx/index/segment/fst/segment.go b/src/m3ninx/index/segment/fst/segment.go index 39edfea328..0302f0624b 100644 --- a/src/m3ninx/index/segment/fst/segment.go +++ b/src/m3ninx/index/segment/fst/segment.go @@ -156,6 +156,43 @@ func NewSegment(data SegmentData, opts Options) (Segment, error) { numDocs: metadata.NumDocs, } + // Preload all the term FSTs so that there's no locking + // required (which was causing lock contention with queries requiring + // access to the terms FST for a field that hasn't been accessed before + // and loading on demand). + iter := newFSTTermsIter() + iter.reset(fstTermsIterOpts{ + seg: s, + fst: fieldsFST, + finalizeFST: false, + }) + + iterCloser := x.NewSafeCloser(iter) + defer func() { _ = iterCloser.Close() }() + + for iter.Next() { + field := iter.Current() + termsFSTOffset := iter.CurrentOffset() + termsFSTBytes, err := s.retrieveBytesWithRLock(s.data.FSTTermsData.Bytes, termsFSTOffset) + if err != nil { + return nil, fmt.Errorf( + "error while decoding terms fst: field=%s, err=%v", field, err) + } + + termsFST, err := vellum.Load(termsFSTBytes) + if err != nil { + return nil, fmt.Errorf( + "error while loading terms fst: field=%s, err=%v", field, err) + } + + // Save FST to FST map. + vellumFST := newVellumFST(termsFST) + s.termFSTs.fstMap.Set(field, vellumFST) + } + if err := iterCloser.Close(); err != nil { + return nil, err + } + // NB(r): The segment uses the context finalization to finalize // resources. Finalize is called after Close is called and all // the segment readers have also been closed. 
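The preload loop above moves all vellum term FST loading to segment construction time, so later lookups are plain map reads with no lock upgrade. Here is a rough sketch of the load-up-front pattern under simplified assumptions: a plain map and a toy fst type stand in for the real fstMap and vellum FST.

package main

import "fmt"

type fst struct{ name string }

type segment struct {
	// termFSTs is immutable after construction, so no mutex is needed.
	termFSTs map[string]*fst
}

func newSegment(fields []string) *segment {
	m := make(map[string]*fst, len(fields))
	for _, f := range fields {
		// The real code decodes and loads one vellum FST per field here;
		// paying that cost once up front means queries never pay it.
		m[f] = &fst{name: f}
	}
	return &segment{termFSTs: m}
}

// retrieveTermsFST is a read-only lookup, mirroring the new
// retrieveTermsFSTWithRLock that returns (fst, ok) with no error path.
func (s *segment) retrieveTermsFST(field string) (*fst, bool) {
	f, ok := s.termFSTs[field]
	return f, ok
}

func main() {
	s := newSegment([]string{"city", "host"})
	if f, ok := s.retrieveTermsFST("city"); ok {
		fmt.Println("loaded:", f.name)
	}
}

The trade-off is a slower segment open in exchange for deleting the double-checked locking path (RLock, miss, Lock, re-check) that was contending under query load.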
@@ -186,7 +223,6 @@ type fsSegment struct { } type vellumFSTs struct { - sync.RWMutex fstMap *fstMap readerPool *fstReaderPool } @@ -270,17 +306,14 @@ func (r *fsSegment) ContainsID(docID []byte) (bool, error) { return false, errReaderClosed } - termsFST, exists, err := r.retrieveTermsFSTWithRLock(doc.IDReservedFieldName) - if err != nil { - return false, err - } - + termsFST, exists := r.retrieveTermsFSTWithRLock(doc.IDReservedFieldName) if !exists { - return false, fmt.Errorf("internal error while retrieving id FST: %v", err) + return false, fmt.Errorf( + "internal error while retrieving id FST: %s", + doc.IDReservedFieldName) } - _, exists, err = termsFST.Get(docID) - + _, exists, err := termsFST.Get(docID) return exists, err } @@ -332,12 +365,10 @@ func (r *fsSegment) Finalize() { r.finalized = true - r.termFSTs.Lock() for _, elem := range r.termFSTs.fstMap.Iter() { vellumFST := elem.Value() vellumFST.fst.Close() } - r.termFSTs.Unlock() r.fieldsFST.Close() @@ -455,11 +486,7 @@ func (i *termsIterable) termsNotClosedMaybeFinalizedWithRLock( return nil, errReaderFinalized } - termsFST, exists, err := i.r.retrieveTermsFSTWithRLock(field) - if err != nil { - return nil, err - } - + termsFST, exists := i.r.retrieveTermsFSTWithRLock(field) if !exists { return sgmt.EmptyTermsIterator, nil } @@ -599,11 +626,7 @@ func (r *fsSegment) matchTermNotClosedMaybeFinalizedWithRLock( return nil, errReaderFinalized } - termsFST, exists, err := r.retrieveTermsFSTWithRLock(field) - if err != nil { - return nil, err - } - + termsFST, exists := r.retrieveTermsFSTWithRLock(field) if !exists { // i.e. we don't know anything about the field, so can early return an empty postings list if index.MigrationReadOnlyPostings() { @@ -692,11 +715,7 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( return nil, errReaderNilRegexp } - termsFST, exists, err := r.retrieveTermsFSTWithRLock(field) - if err != nil { - return nil, err - } - + termsFST, exists := r.retrieveTermsFSTWithRLock(field) if !exists { // i.e. we don't know anything about the field, so can early return an empty postings list if index.MigrationReadOnlyPostings() { @@ -734,7 +753,10 @@ func (r *fsSegment) matchRegexpNotClosedMaybeFinalizedWithRLock( iterErr = searcher.iter.Next() } - var pl postings.List + var ( + pl postings.List + err error + ) if index.MigrationReadOnlyPostings() { // Perform a lazy fast union. pl, err = roaring.UnionReadOnly(searcher.pls) @@ -835,47 +857,8 @@ func (r *fsSegment) retrievePostingsListWithRLock(postingsOffset uint64) (postin return pilosa.Unmarshal(postingsBytes) } -func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (vellumFST, bool, error) { - r.termFSTs.RLock() - fst, ok := r.termFSTs.fstMap.Get(field) - r.termFSTs.RUnlock() - if ok { - return fst, true, nil - } - - r.termFSTs.Lock() - defer r.termFSTs.Unlock() - - fst, ok = r.termFSTs.fstMap.Get(field) - if ok { - return fst, true, nil - } - - termsFSTOffset, exists, err := r.fieldsFST.Get(field) - if err != nil { - return vellumFST{}, false, err - } - - if !exists { - return vellumFST{}, false, nil - } - - termsFSTBytes, err := r.retrieveBytesWithRLock(r.data.FSTTermsData.Bytes, termsFSTOffset) - if err != nil { - return vellumFST{}, false, fmt.Errorf("error while decoding terms fst: %v", err) - } - - termsFST, err := vellum.Load(termsFSTBytes) - if err != nil { - return vellumFST{}, false, fmt.Errorf("error while loading terms fst: %v", err) - } - - // Save FST to FST map. 
- vellumFST := newVellumFST(termsFST) - r.termFSTs.fstMap.Set(field, vellumFST) - - // Return result. - return vellumFST, true, nil +func (r *fsSegment) retrieveTermsFSTWithRLock(field []byte) (vellumFST, bool) { + return r.termFSTs.fstMap.Get(field) } // retrieveTermsBytesWithRLock assumes the base []byte slice is a collection of From 1c866394048d174cd9218eb861011ac37dddc1af Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 25 Feb 2021 22:17:27 -0500 Subject: [PATCH 087/106] Fix CI tests --- .../index_multiple_block_query_test.go | 60 ++++++++++++++++--- src/dbnode/storage/index.go | 6 ++ src/dbnode/storage/index/mutable_segments.go | 21 ++----- 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/src/dbnode/integration/index_multiple_block_query_test.go b/src/dbnode/integration/index_multiple_block_query_test.go index b12c3bcf38..ecf864e3b4 100644 --- a/src/dbnode/integration/index_multiple_block_query_test.go +++ b/src/dbnode/integration/index_multiple_block_query_test.go @@ -23,17 +23,22 @@ package integration import ( + "strings" "testing" "time" + "github.com/stretchr/testify/require" + "github.com/uber-go/tally" + "go.uber.org/zap" + "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/dbnode/persist/fs" "github.com/m3db/m3/src/dbnode/retention" "github.com/m3db/m3/src/dbnode/storage/index" + xmetrics "github.com/m3db/m3/src/dbnode/x/metrics" "github.com/m3db/m3/src/m3ninx/idx" xclock "github.com/m3db/m3/src/x/clock" - - "github.com/stretchr/testify/require" - "go.uber.org/zap" + "github.com/m3db/m3/src/x/instrument" ) /* @@ -57,9 +62,10 @@ func TestIndexMultipleBlockQuery(t *testing.T) { indexBlockSize = time.Hour bufferFuture = 5 * time.Minute bufferPast = 10 * time.Minute + verifyTimeout = 2 * time.Minute ) - // Test setup + // Test setup. md, err := namespace.NewMetadata(testNamespaces[0], namespace.NewOptions(). SetRetentionOptions( @@ -80,6 +86,13 @@ func TestIndexMultipleBlockQuery(t *testing.T) { require.NoError(t, err) defer testSetup.Close() + reporter := xmetrics.NewTestStatsReporter(xmetrics.NewTestStatsReporterOptions()) + scope, closer := tally.NewRootScope( + tally.ScopeOptions{Reporter: reporter}, time.Millisecond) + defer closer.Close() + testSetup.SetStorageOpts(testSetup.StorageOpts().SetInstrumentOptions( + instrument.NewOptions().SetMetricsScope(scope))) + t0 := time.Date(2018, time.May, 6, 12, 50, 0, 0, time.UTC) t1 := t0.Add(10 * time.Minute) t2 := t1.Add(5 * time.Minute) @@ -88,11 +101,11 @@ func TestIndexMultipleBlockQuery(t *testing.T) { writesPeriod0 := GenerateTestIndexWrite(0, numWrites, numTags, t0, t1) writesPeriod1 := GenerateTestIndexWrite(1, numWrites, numTags, t1, t2) - // Start the server + // Start the server. log := testSetup.StorageOpts().InstrumentOptions().Logger() require.NoError(t, testSetup.StartServer()) - // Stop the server + // Stop the server. defer func() { require.NoError(t, testSetup.StopServer()) log.Debug("server is now down") @@ -118,7 +131,40 @@ func TestIndexMultipleBlockQuery(t *testing.T) { require.True(t, indexed) log.Info("verified data is indexed", zap.Duration("took", time.Since(start))) - // "shared":"shared", is a common tag across all written metrics + // Progress and flush so that data gets evicted from in-memory block + // that no longer needs to be there. + testSetup.SetNowFn(t2.Add(indexBlockSize).Add(bufferPast)) + // Now wait for a flush so that in memory data is empty. 
+	log.Info("waiting till filesets found on disk")
+	found := xclock.WaitUntil(func() bool {
+		filesets, err := fs.IndexFileSetsAt(testSetup.FilePathPrefix(), md.ID(), t1)
+		require.NoError(t, err)
+		return len(filesets) == 1
+	}, verifyTimeout)
+	require.True(t, found)
+	log.Info("found filesets on disk")
+
+	// Ensure we've evicted the mutable segments.
+	log.Info("waiting till notify sealed blocks")
+	evicted := xclock.WaitUntil(func() bool {
+		counters := reporter.Counters()
+		counter, ok := counters["dbindex.blocks-notify-sealed"]
+		return ok && counter > 10
+	}, verifyTimeout)
+	require.True(t, evicted)
+	log.Info("notify sealed blocks complete")
+
+	for {
+		log.Info("reporting metrics")
+		for k, v := range reporter.Counters() {
+			if strings.Contains(k, "active") || strings.Contains(k, "notify") || strings.Contains(k, "tick") {
+				log.Info("metric reported", zap.String("name", k), zap.Int64("value", v))
+			}
+		}
+		// time.Sleep(5 * time.Second)
+		break
+	}
+
+	// "shared":"shared", is a common tag across all written metrics.
 	query := index.Query{
 		Query: idx.NewTermQuery([]byte("shared"), []byte("shared"))}
 
diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go
index 1c3cad3609..0d3fb15c56 100644
--- a/src/dbnode/storage/index.go
+++ b/src/dbnode/storage/index.go
@@ -942,6 +942,8 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceInd
 		// and is an expensive task that doesn't require
 		// holding the index lock.
 		_ = activeBlock.InMemoryBlockNotifySealedBlocks(sealedBlocks)
+		i.metrics.blocksNotifySealed.Inc(int64(len(sealedBlocks)))
+		i.metrics.tick.Inc(1)
 	}()
 
 	earliestBlockStartToRetain := i.earliestBlockStartToRetainWithLock(startTime)
@@ -2286,6 +2288,7 @@ type nsIndexMetrics struct {
 	asyncInsertAttemptSkip  tally.Counter
 	asyncInsertAttemptWrite tally.Counter
 
+	tick                         tally.Counter
 	asyncInsertSuccess           tally.Counter
 	asyncInsertErrors            tally.Counter
 	insertAfterClose             tally.Counter
@@ -2295,6 +2298,7 @@ type nsIndexMetrics struct {
 	forwardIndexCounter          tally.Counter
 	insertEndToEndLatency        tally.Timer
 	blocksEvictedMutableSegments tally.Counter
+	blocksNotifySealed           tally.Counter
 	blockMetrics                 nsIndexBlocksMetrics
 	indexingConcurrencyMin       tally.Gauge
 	indexingConcurrencyMax       tally.Gauge
@@ -2330,6 +2334,7 @@ func newNamespaceIndexMetrics(
 	scope := iopts.MetricsScope()
 	blocksScope := scope.SubScope("blocks")
 	m := nsIndexMetrics{
+		tick: scope.Counter("index-tick"),
 		asyncInsertAttemptTotal: scope.Tagged(map[string]string{
 			"stage": "process",
 		}).Counter(indexAttemptName),
@@ -2361,6 +2366,7 @@ func newNamespaceIndexMetrics(
 		insertEndToEndLatency: instrument.NewTimer(scope,
 			"insert-end-to-end-latency", iopts.TimerOptions()),
 		blocksEvictedMutableSegments: scope.Counter("blocks-evicted-mutable-segments"),
+		blocksNotifySealed:           scope.Counter("blocks-notify-sealed"),
 		blockMetrics:                 newNamespaceIndexBlocksMetrics(opts, blocksScope),
 		indexingConcurrencyMin: scope.Tagged(map[string]string{
 			"stat": "min",
diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go
index 681923b818..4283095dd6 100644
--- a/src/dbnode/storage/index/mutable_segments.go
+++ b/src/dbnode/storage/index/mutable_segments.go
@@ -118,10 +118,6 @@ type mutableSegmentsMetrics struct {
 	backgroundCompactionPlanRunLatency tally.Timer
 	backgroundCompactionTaskRunLatency tally.Timer
 	activeBlockIndexNew                tally.Counter
-	activeBlockIndexExists             tally.Counter
-	activeBlockBloomNew                tally.Counter
-	activeBlockBloomExists             tally.Counter
-	activeBlockBloomUpdate             tally.Counter
activeBlockGarbageCollectSegment tally.Counter activeBlockGarbageCollectSeries tally.Counter } @@ -138,18 +134,6 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { activeBlockIndexNew: activeBlockScope.Tagged(map[string]string{ "result_type": "new", }).Counter("index-result"), - activeBlockIndexExists: activeBlockScope.Tagged(map[string]string{ - "result_type": "exists", - }).Counter("index-result"), - activeBlockBloomNew: activeBlockScope.Tagged(map[string]string{ - "result_type": "new", - }).Counter("bloom-result"), - activeBlockBloomExists: activeBlockScope.Tagged(map[string]string{ - "result_type": "exists", - }).Counter("bloom-result"), - activeBlockBloomUpdate: activeBlockScope.Tagged(map[string]string{ - "result_type": "update", - }).Counter("bloom-result"), activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), activeBlockGarbageCollectSeries: activeBlockScope.Counter("gc-series"), } @@ -273,7 +257,8 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, Docs: docs, AllowPartialUpdates: true, }) - if len(segmentBuilder.Docs()) == 0 { + n := len(segmentBuilder.Docs()) + if n == 0 { // No inserts, no need to compact. return MutableSegmentsStats{}, insertResultErr } @@ -286,6 +271,8 @@ func (m *mutableSegments) WriteBatch(inserts *WriteBatch) (MutableSegmentsStats, return MutableSegmentsStats{}, err } + m.metrics.activeBlockIndexNew.Inc(int64(n)) + // Return result from the original insertion since compaction was successful. return result, insertResultErr } From 33590cc9cc6d98b105162965c9610607a4eced03 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 26 Feb 2021 16:09:49 -0500 Subject: [PATCH 088/106] Always consider the in-memory block --- src/dbnode/integration/setup.go | 3 ++- src/dbnode/storage/index.go | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/dbnode/integration/setup.go b/src/dbnode/integration/setup.go index 95f921b17c..107dc48af6 100644 --- a/src/dbnode/integration/setup.go +++ b/src/dbnode/integration/setup.go @@ -287,7 +287,8 @@ func NewTestSetup( indexOpts := storageOpts.IndexOptions(). SetInsertMode(indexMode). - SetPostingsListCache(plCache) + SetPostingsListCache(plCache). + SetInstrumentOptions(storageOpts.InstrumentOptions()) storageOpts = storageOpts.SetIndexOptions(indexOpts) runtimeOptsMgr := storageOpts.RuntimeOptionsManager() diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 0d3fb15c56..0644d7f683 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -2538,9 +2538,6 @@ func newBlocksIterStackAlloc( func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { iter := i - if i.queryRanges.IsEmpty() { - return iter, false - } for { iter.idx++ @@ -2549,6 +2546,12 @@ func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) { return iter, true } + // No more ranges to query, perform this second so that + // the in memory block always returns results. 
+ if i.queryRanges.IsEmpty() { + return iter, false + } + if iter.idx >= len(i.blocks) { return iter, false } From 6a0aa102d22f5d6c9f637611a9f3323af8649954 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 8 Mar 2021 15:58:46 -0500 Subject: [PATCH 089/106] Filter instead of shrink slice --- src/dbnode/storage/index.go | 16 +++++++--------- src/dbnode/storage/index/types.go | 10 ++++------ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index 0644d7f683..2fc5317fbc 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -655,17 +655,15 @@ func (i *nsIndex) WritePending( pending []writes.PendingIndexInsert, ) error { // Filter anything with a pending index out before acquiring lock. - for j := 0; j < len(pending); j++ { - t := xtime.ToUnixNano(pending[j].Entry.Timestamp.Truncate(i.blockSize)) - if !pending[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) { + incoming := pending + pending = pending[:0] + for j := range incoming { + t := xtime.ToUnixNano(incoming[j].Entry.Timestamp.Truncate(i.blockSize)) + if incoming[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) { continue } - // Remove this elem by moving tail here and shrinking by one. - n := len(pending) - pending[j] = pending[n-1] - pending = pending[:n-1] - // Reprocess element. - j-- + // Continue to add this element. + pending = append(pending, incoming[j]) } if len(pending) == 0 { return nil diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index d2168aab3d..11eee06dc0 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -355,6 +355,8 @@ type OnIndexSeries interface { ) RemoveIndexedForBlockStartsResult } +// RemoveIndexedForBlockStartsResult is the result from calling +// RemoveIndexedForBlockStarts. type RemoveIndexedForBlockStartsResult struct { IndexedBlockStartsRemoved int IndexedBlockStartsRemaining int @@ -714,13 +716,9 @@ func (b *WriteBatch) ForEachUnmarkedBatchByBlockStart( } } +// PendingAny returns whether there are any pending documents to be inserted. 
func (b *WriteBatch) PendingAny() bool {
-	for i := range b.entries {
-		if !b.entries[i].result.Done {
-			return true
-		}
-	}
-	return false
+	return len(b.PendingDocs()) > 0
 }
 
 func (b *WriteBatch) numPending() int {

From 9421baeab98318a8dada2f66f2158ef33b5373ee Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Wed, 10 Mar 2021 15:11:07 -0500
Subject: [PATCH 090/106] Only decrement readerwrite count if marking as success

---
 src/dbnode/storage/series/lookup/entry.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/dbnode/storage/series/lookup/entry.go b/src/dbnode/storage/series/lookup/entry.go
index 31553d7176..d99838767a 100644
--- a/src/dbnode/storage/series/lookup/entry.go
+++ b/src/dbnode/storage/series/lookup/entry.go
@@ -218,8 +218,10 @@ func (entry *Entry) IfAlreadyIndexedMarkIndexSuccessAndFinalize(
 		entry.reverseIndex.setAttemptWithWLock(blockStart, false)
 	}
 	entry.reverseIndex.Unlock()
-	// indicate the index has released held reference for provided write
-	entry.DecrementReaderWriterCount()
+	if successAlready {
+		// indicate the index has released held reference for provided write
+		entry.DecrementReaderWriterCount()
+	}
 	return successAlready
 }
 
From daed4916b7ed6cee8fa75188ff51e9bac7374196 Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Fri, 12 Mar 2021 15:39:54 -0500
Subject: [PATCH 091/106] Always re-resolve entry when checking if should GC series

---
 src/dbnode/storage/index/mutable_segments.go | 16 ++++++--
 src/dbnode/storage/index/types.go            |  4 ++
 src/dbnode/storage/series/lookup/entry.go    | 42 ++++++++++++--------
 src/dbnode/storage/shard.go                  |  7 ++++
 4 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go
index 4283095dd6..a7c56aec08 100644
--- a/src/dbnode/storage/index/mutable_segments.go
+++ b/src/dbnode/storage/index/mutable_segments.go
@@ -23,7 +23,6 @@ package index
 import (
 	"errors"
 	"fmt"
-	"github.com/m3db/m3/src/m3ninx/doc"
 	"math"
 	"runtime"
 	"sync"
@@ -32,6 +31,7 @@ import (
 	"github.com/m3db/m3/src/dbnode/namespace"
 	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
 	"github.com/m3db/m3/src/dbnode/storage/index/segments"
+	"github.com/m3db/m3/src/m3ninx/doc"
 	m3ninxindex "github.com/m3db/m3/src/m3ninx/index"
 	"github.com/m3db/m3/src/m3ninx/index/segment"
 	"github.com/m3db/m3/src/m3ninx/index/segment/builder"
@@ -682,14 +682,24 @@ func (m *mutableSegments) backgroundCompactWithTask(
 			})
 			return true
 		}
-		onIndexSeries, ok := d.Ref.(OnIndexSeries)
+
+		entry, ok := d.Ref.(OnIndexSeries)
 		if !ok {
 			instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
 				l.Error("unexpected type for document ref for background compact")
 			})
 			return true
 		}
-		result := onIndexSeries.RemoveIndexedForBlockStarts(sealedBlocks)
+
+		latestEntry, ok := entry.RelookupAndIncrementReaderWriterCount()
+		if !ok {
+			// Entry no longer valid in shard.
+			return false
+		}
+
+		result := latestEntry.RemoveIndexedForBlockStarts(sealedBlocks)
+		latestEntry.DecrementReaderWriterCount()
+
 		// Keep the series if and only if there are remaining
 		// index block starts outside of the sealed blocks starts.
return result.IndexedBlockStartsRemaining > 0 diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index 11eee06dc0..ccdff231e4 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -353,6 +353,10 @@ type OnIndexSeries interface { RemoveIndexedForBlockStarts( blockStarts map[xtime.UnixNano]struct{}, ) RemoveIndexedForBlockStartsResult + + RelookupAndIncrementReaderWriterCount() (OnIndexSeries, bool) + + DecrementReaderWriterCount() } // RemoveIndexedForBlockStartsResult is the result from calling diff --git a/src/dbnode/storage/series/lookup/entry.go b/src/dbnode/storage/series/lookup/entry.go index d99838767a..669e91d365 100644 --- a/src/dbnode/storage/series/lookup/entry.go +++ b/src/dbnode/storage/series/lookup/entry.go @@ -58,13 +58,14 @@ type IndexWriter interface { // members to track lifecycle and minimize indexing overhead. // NB: users are expected to use `NewEntry` to construct these objects. type Entry struct { - Series series.DatabaseSeries - Index uint64 - indexWriter IndexWriter - curReadWriters int32 - reverseIndex entryIndexState - nowFn clock.NowFn - pendingIndexBatchSizeOne []writes.PendingIndexInsert + relookupAndIncrementReaderWriterCount func() (index.OnIndexSeries, bool) + Series series.DatabaseSeries + Index uint64 + indexWriter IndexWriter + curReadWriters int32 + reverseIndex entryIndexState + nowFn clock.NowFn + pendingIndexBatchSizeOne []writes.PendingIndexInsert } // OnReleaseReadWriteRef is a callback that can release @@ -84,10 +85,11 @@ var _ bootstrap.SeriesRef = &Entry{} // NewEntryOptions supplies options for a new entry. type NewEntryOptions struct { - Series series.DatabaseSeries - Index uint64 - IndexWriter IndexWriter - NowFn clock.NowFn + RelookupAndIncrementReaderWriterCount func() (index.OnIndexSeries, bool) + Series series.DatabaseSeries + Index uint64 + IndexWriter IndexWriter + NowFn clock.NowFn } // NewEntry returns a new Entry. @@ -97,16 +99,22 @@ func NewEntry(opts NewEntryOptions) *Entry { nowFn = opts.NowFn } entry := &Entry{ - Series: opts.Series, - Index: opts.Index, - indexWriter: opts.IndexWriter, - nowFn: nowFn, - pendingIndexBatchSizeOne: make([]writes.PendingIndexInsert, 1), - reverseIndex: newEntryIndexState(), + relookupAndIncrementReaderWriterCount: opts.RelookupAndIncrementReaderWriterCount, + Series: opts.Series, + Index: opts.Index, + indexWriter: opts.IndexWriter, + nowFn: nowFn, + pendingIndexBatchSizeOne: make([]writes.PendingIndexInsert, 1), + reverseIndex: newEntryIndexState(), } return entry } +// RelookupAndIncrementReaderWriterCount will relookup the entry. +func (entry *Entry) RelookupAndIncrementReaderWriterCount() (index.OnIndexSeries, bool) { + return entry.relookupAndIncrementReaderWriterCount() +} + // ReaderWriterCount returns the current ref count on the Entry. 
func (entry *Entry) ReaderWriterCount() int32 {
 	return atomic.LoadInt32(&entry.curReadWriters)
 }
 
diff --git a/src/dbnode/storage/shard.go b/src/dbnode/storage/shard.go
index 8f4008e0ff..da97056d56 100644
--- a/src/dbnode/storage/shard.go
+++ b/src/dbnode/storage/shard.go
@@ -1274,6 +1274,13 @@ func (s *dbShard) newShardEntry(
 		Options: s.seriesOpts,
 	})
 	return lookup.NewEntry(lookup.NewEntryOptions{
+		RelookupAndIncrementReaderWriterCount: func() (index.OnIndexSeries, bool) {
+			e, _, err := s.tryRetrieveWritableSeries(seriesID)
+			if err != nil || e == nil {
+				return nil, false
+			}
+			return e, true
+		},
 		Series:      newSeries,
 		Index:       uniqueIndex,
 		IndexWriter: s.reverseIndex,

From 3de16900b409d420e93a220a969d22a3065fa720 Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Thu, 8 Apr 2021 17:24:03 -0400
Subject: [PATCH 092/106] Avoid index block ticking while holding index write
 lock

---
 src/dbnode/storage/index.go | 120 +++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 44 deletions(-)

diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go
index 719c4d6f85..e042cc429b 100644
--- a/src/dbnode/storage/index.go
+++ b/src/dbnode/storage/index.go
@@ -921,58 +921,94 @@ func (i *nsIndex) Bootstrapped() bool {
 	return result
 }
 
-func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceIndexTickResult, error) {
-	var (
-		multiErr                   xerrors.MultiError
-		result                     = namespaceIndexTickResult{}
-		earliestBlockStartToRetain = retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, startTime)
-	)
+func (i *nsIndex) Tick(
+	c context.Cancellable,
+	startTime time.Time,
+) (namespaceIndexTickResult, error) {
+	var result namespaceIndexTickResult
+
+	// First collect blocks and acquire the lock to remove those that need
+	// removing, then release the lock so we can tick and perform other
+	// expensive tasks such as notifying of sealed blocks.
+	tickingBlocks, multiErr := i.tickingBlocks(startTime)
+
+	result.NumBlocks = int64(tickingBlocks.totalBlocks)
+	for _, block := range tickingBlocks.tickingBlocks {
+		if c.IsCancelled() {
+			multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation)
+			return result, multiErr.FinalError()
+		}
+
+		blockTickResult, tickErr := block.Tick(c)
+		multiErr = multiErr.Add(tickErr)
+		result.NumSegments += blockTickResult.NumSegments
+		result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
+		result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
+		result.NumTotalDocs += blockTickResult.NumDocs
+		result.FreeMmap += blockTickResult.FreeMmap
+	}
+
+	blockTickResult, tickErr := tickingBlocks.activeBlock.Tick(c)
+	multiErr = multiErr.Add(tickErr)
+	result.NumSegments += blockTickResult.NumSegments
+	result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
+	result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
+	result.NumTotalDocs += blockTickResult.NumDocs
+	result.FreeMmap += blockTickResult.FreeMmap
+
+	// Notify in memory block of sealed blocks
+	// and make sure to do this out of the lock since
+	// this can take a considerable amount of time
+	// and is an expensive task that doesn't require
+	// holding the index lock.
+	_ = tickingBlocks.activeBlock.InMemoryBlockNotifySealedBlocks(tickingBlocks.sealedBlocks)
+	i.metrics.blocksNotifySealed.Inc(int64(len(tickingBlocks.sealedBlocks)))
+	i.metrics.tick.Inc(1)
+
+	return result, multiErr.FinalError()
+}
+
+type tickingBlocksResult struct {
+	totalBlocks   int
+	evictedBlocks int
+	activeBlock   index.Block
+	tickingBlocks []index.Block
+	sealedBlocks  []xtime.UnixNano
+}
+
+func (i *nsIndex) tickingBlocks(
+	startTime time.Time,
+) (tickingBlocksResult, xerrors.MultiError) {
+	multiErr := xerrors.NewMultiError()
+	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(
+		i.retentionPeriod, i.blockSize, startTime)
+	evictedBlocks := 0
 
 	i.state.Lock()
+	activeBlock := i.inMemoryBlock
+	tickingBlocks := make([]index.Block, 0, len(i.state.blocksByTime))
 	sealedBlocks := make([]xtime.UnixNano, 0, len(i.state.blocksByTime))
 	defer func() {
 		i.updateBlockStartsWithLock()
-		activeBlock := i.inMemoryBlock
 		i.state.Unlock()
-		// Notify in memory block of sealed blocks
-		// and make sure to do this out of the lock since
-		// this can take a considerable amount of time
-		// and is an expensive task that doesn't require
-		// holding the index lock.
-		_ = activeBlock.InMemoryBlockNotifySealedBlocks(sealedBlocks)
-		i.metrics.blocksNotifySealed.Inc(int64(len(sealedBlocks)))
-		i.metrics.tick.Inc(1)
 	}()
 
-	result.NumBlocks = int64(len(i.state.blocksByTime))
 	for blockStart, block := range i.state.blocksByTime {
-		if c.IsCancelled() {
-			multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation)
-			return result, multiErr.FinalError()
-		}
-
-		// drop any blocks past the retention period
+		// Drop any blocks past the retention period.
 		if blockStart.ToTime().Before(earliestBlockStartToRetain) {
 			multiErr = multiErr.Add(block.Close())
 			delete(i.state.blocksByTime, blockStart)
-			result.NumBlocksEvicted++
-			result.NumBlocks--
+			evictedBlocks++
 			continue
 		}
 
-		// tick any blocks we're going to retain
-		blockTickResult, tickErr := block.Tick(c)
-		multiErr = multiErr.Add(tickErr)
-		result.NumSegments += blockTickResult.NumSegments
-		result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
-		result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
-		result.NumTotalDocs += blockTickResult.NumDocs
-		result.FreeMmap += blockTickResult.FreeMmap
+		// Tick any blocks we're going to retain, but don't tick inline here;
+		// we'll do this outside of the lock.
+		tickingBlocks = append(tickingBlocks, block)
 
-		// seal any blocks that are sealable
+		// Seal any blocks that are sealable while holding lock (seal is fast).
if !blockStart.ToTime().After(i.lastSealableBlockStart(startTime)) && !block.IsSealed() { multiErr = multiErr.Add(block.Seal()) - result.NumBlocksSealed++ } if block.IsSealed() { @@ -980,16 +1016,12 @@ func (i *nsIndex) Tick(c context.Cancellable, startTime time.Time) (namespaceInd } } - block := i.inMemoryBlock - blockTickResult, tickErr := block.Tick(c) - multiErr = multiErr.Add(tickErr) - result.NumSegments += blockTickResult.NumSegments - result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped - result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable - result.NumTotalDocs += blockTickResult.NumDocs - result.FreeMmap += blockTickResult.FreeMmap - - return result, multiErr.FinalError() + return tickingBlocksResult{ + totalBlocks: len(i.state.blocksByTime), + activeBlock: activeBlock, + tickingBlocks: tickingBlocks, + sealedBlocks: sealedBlocks, + }, multiErr } func (i *nsIndex) WarmFlush( From da1aeb4bcb50330b59f8fbcb8e3f2d3ba78bb0b7 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 5 Apr 2021 23:14:46 -0400 Subject: [PATCH 093/106] Rerun cached searches when replacing background segments to avoid needing to rebuild all cached searches again --- src/dbnode/storage/index/mutable_segments.go | 89 +++++++-- .../storage/index/postings_list_cache.go | 187 ++++++++++++++++-- .../storage/index/read_through_segment.go | 69 +++---- 3 files changed, 273 insertions(+), 72 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index a08b123ccb..dd8f23341c 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -113,13 +113,14 @@ func (f *indexedBloomFilter) Write(id []byte) { } type mutableSegmentsMetrics struct { - foregroundCompactionPlanRunLatency tally.Timer - foregroundCompactionTaskRunLatency tally.Timer - backgroundCompactionPlanRunLatency tally.Timer - backgroundCompactionTaskRunLatency tally.Timer - activeBlockIndexNew tally.Counter - activeBlockGarbageCollectSegment tally.Counter - activeBlockGarbageCollectSeries tally.Counter + foregroundCompactionPlanRunLatency tally.Timer + foregroundCompactionTaskRunLatency tally.Timer + backgroundCompactionPlanRunLatency tally.Timer + backgroundCompactionTaskRunLatency tally.Timer + activeBlockIndexNew tally.Counter + activeBlockGarbageCollectSegment tally.Counter + activeBlockGarbageCollectSeries tally.Counter + backgroundCompactionRerunCachedSearch tally.Counter } func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { @@ -134,8 +135,9 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { activeBlockIndexNew: activeBlockScope.Tagged(map[string]string{ "result_type": "new", }).Counter("index-result"), - activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), - activeBlockGarbageCollectSeries: activeBlockScope.Counter("gc-series"), + activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), + activeBlockGarbageCollectSeries: activeBlockScope.Counter("gc-series"), + backgroundCompactionRerunCachedSearch: backgroundScope.Counter("rerun-cached-search"), } } @@ -642,7 +644,7 @@ func (m *mutableSegments) backgroundCompactWithPlan( wg.Wait() } -func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) segment.Segment { +func (m *mutableSegments) newReadThroughSegment(seg fst.Segment) *ReadThroughSegment { var ( plCaches = ReadThroughSegmentCaches{ SegmentPostingsListCache: m.opts.PostingsListCache(), @@ -733,12 +735,73 @@ func (m 
*mutableSegments) backgroundCompactWithTask(
 return err
 }

- var replaceSegment segment.Segment
+ var replaceSeg segment.Segment
 if !empty {
 // Add a read through cache for repeated expensive queries against
 // background compacted segments since they can live for quite some
 // time and accrue a large set of documents.
- replaceSegment = m.newReadThroughSegment(compacted)
+ readThroughSeg := m.newReadThroughSegment(compacted)
+ replaceSeg = readThroughSeg
+
+ // NB(r): Before replacing the old segments with the compacted segment
+ // we rebuild all the cached postings lists that the previous segment had
+ // to avoid latency spikes during segment rotation.
+ // Note: There were very obvious peaks of latency (p99 of <500ms spiking
+ // to 8 times that at first replace of large segments after a block
+ // rotation) without this optimization.
+ for _, segment := range segments {
+ prevReadThroughSeg, ok := segment.(*ReadThroughSegment)
+ if !ok {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to cast compacted segment to read through segment")
+ })
+ continue
+ }
+
+ searches := prevReadThroughSeg.CachedSearchPatterns()
+ for _, s := range searches {
+ r, err := readThroughSeg.Reader()
+ if err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to create read through segment reader")
+ })
+ continue
+ }
+
+ if s.SearchQuery == nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("no search query for cached search pattern")
+ })
+ continue
+ }
+
+ searcher, err := s.SearchQuery.Searcher()
+ if err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to create searcher from cached search pattern")
+ })
+ continue
+ }
+
+ pl, err := searcher.Search(r)
+ if err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to search read through segment reader")
+ })
+ continue
+ }
+
+ if err := r.Close(); err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to close read through segment reader")
+ })
+ continue
+ }
+
+ readThroughSeg.PutCachedSearchPattern(s.Field, s.SearchQuery, pl)
+ m.metrics.backgroundCompactionRerunCachedSearch.Inc(1)
+ }
+ }
 }

 // Rotate out the replaced frozen segments and add the compacted one.
@@ -746,7 +809,7 @@ func (m *mutableSegments) backgroundCompactWithTask( defer m.Unlock() result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, replaceSegment) + segments, replaceSeg) m.backgroundSegments = result return nil diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index 3f1b6fc8c2..f854da0541 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -23,10 +23,12 @@ package index import ( "bytes" "errors" + "sync" "time" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" + "github.com/m3db/m3/src/m3ninx/search" "github.com/m3db/m3/src/x/instrument" "github.com/cespare/xxhash/v2" @@ -86,7 +88,35 @@ type PostingsListCache struct { size int opts PostingsListCacheOptions metrics *postingsListCacheMetrics - logger *zap.Logger + + registry postingsListCacheRegistry + + logger *zap.Logger +} + +type postingsListCacheRegistry struct { + sync.RWMutex + eventCh chan postingsListEvent + active map[uuid.Array]map[registryKey]postings.List +} + +type registryKey struct { + field string + pattern string + patternType PatternType + searchQuery search.Query +} + +type postingsListEventType int + +const ( + addEventType postingsListEventType = iota + removeEventType +) + +type postingsListEvent struct { + eventType postingsListEventType + cachedPostings *cachedPostings } // NewPostingsListCache creates a new query cache. @@ -99,28 +129,43 @@ func NewPostingsListCache( return nil, nil, err } - lru, err := ristretto.NewCache(&ristretto.Config{ + plc := &PostingsListCache{ + size: size, + opts: opts, + registry: postingsListCacheRegistry{ + eventCh: make(chan postingsListEvent, 4096), + active: make(map[uuid.Array]map[registryKey]postings.List), + }, + metrics: newPostingsListCacheMetrics(opts.InstrumentOptions.MetricsScope()), + logger: opts.InstrumentOptions.Logger(), + } + plc.lru, err = ristretto.NewCache(&ristretto.Config{ NumCounters: int64(10 * size), // number of keys to track frequency of. MaxCost: int64(size), // maximum cost of cache. BufferItems: 64, // number of keys per Get buffer. KeyToHash: func(k interface{}) (uint64, uint64) { return k.(uint64), 0 }, + OnEvict: plc.onEvict, }) if err != nil { return nil, nil, err } - plc := &PostingsListCache{ - size: size, - lru: lru, - opts: opts, - metrics: newPostingsListCacheMetrics(opts.InstrumentOptions.MetricsScope()), - logger: opts.InstrumentOptions.Logger(), + closer := plc.startLoop() + return plc, closer, nil +} + +func (q *PostingsListCache) onEvict(key, conflict uint64, value interface{}, cost int64) { + v, ok := value.(*cachedPostings) + if !ok { + return } - closer := plc.startReportLoop() - return plc, closer, nil + q.registry.eventCh <- postingsListEvent{ + eventType: removeEventType, + cachedPostings: v, + } } // GetRegexp returns the cached results for the provided regexp query, if any. @@ -188,6 +233,8 @@ type cachedPostings struct { field string pattern string patternType PatternType + // searchQuery is only set for search queries. + searchQuery search.Query // value postings postings.List @@ -215,7 +262,7 @@ func (q *PostingsListCache) PutRegexp( pattern string, pl postings.List, ) { - q.put(segmentUUID, field, pattern, PatternTypeRegexp, pl) + q.put(segmentUUID, field, pattern, PatternTypeRegexp, nil, pl) } // PutTerm updates the LRU with the result of the term query. 
@@ -225,7 +272,7 @@ func (q *PostingsListCache) PutTerm(
 pattern string,
 pl postings.List,
 ) {
- q.put(segmentUUID, field, pattern, PatternTypeTerm, pl)
+ q.put(segmentUUID, field, pattern, PatternTypeTerm, nil, pl)
 }

 // PutField updates the LRU with the result of the field query.
@@ -234,13 +281,14 @@ func (q *PostingsListCache) PutField(
 field string,
 pl postings.List,
 ) {
- q.put(segmentUUID, field, emptyPattern, PatternTypeField, pl)
+ q.put(segmentUUID, field, emptyPattern, PatternTypeField, nil, pl)
 }

 // PutSearch updates the LRU with the result of a search query.
 func (q *PostingsListCache) PutSearch(
 segmentUUID uuid.UUID,
- query string,
+ queryStr string,
+ query search.Query,
 pl postings.List,
 ) {
 if roaring.IsReadOnlyPostingsList(pl) {
@@ -256,7 +304,7 @@ func (q *PostingsListCache) PutSearch(
 pl = mutable
 }

- q.put(segmentUUID, query, emptyPattern, PatternTypeSearch, pl)
+ q.put(segmentUUID, queryStr, emptyPattern, PatternTypeSearch, query, pl)
 }

 func (q *PostingsListCache) put(
@@ -264,6 +312,7 @@ func (q *PostingsListCache) put(
 field string,
 pattern string,
 patternType PatternType,
+ searchQuery search.Query,
 pl postings.List,
 ) {
 key := keyHash(segmentUUID, field, pattern, patternType)
@@ -272,16 +321,21 @@ func (q *PostingsListCache) put(
 field: field,
 pattern: pattern,
 patternType: patternType,
+ searchQuery: searchQuery,
 postings: pl,
 }
 q.lru.Set(key, value, 1)
 q.emitCachePutMetrics(patternType)
+ q.registry.eventCh <- postingsListEvent{
+ eventType: addEventType,
+ cachedPostings: value,
+ }
 }

-// startReportLoop starts a background process that will call Report()
-// on a regular basis and returns a function that will end the background
-// process.
-func (q *PostingsListCache) startReportLoop() Closer {
+// startLoop starts background processes that call Report() on a regular
+// basis and process cache registry events, returning a function that will
+// end the background processes.
+func (q *PostingsListCache) startLoop() Closer {
 doneCh := make(chan struct{})

 go func() {
@@ -297,9 +351,108 @@
 }
 }()

+ go func() {
+ for {
+ // Process first without lock (just wait blindly).
+ var ev postingsListEvent
+ select {
+ case <-doneCh:
+ return
+ case ev = <-q.registry.eventCh:
+ }
+
+ // Now acquire lock and process as many as we can while batched.
+ q.registry.Lock()
+ // Process first.
+ q.processEventWithLock(ev)
+ // Process as many events as possible while holding the lock until
+ // there are none left to read.
+ for more := true; more; { + select { + case ev = <-q.registry.eventCh: + q.processEventWithLock(ev) + default: + more = false + } + } + q.registry.Unlock() + } + }() + return func() { close(doneCh) } } +type CachedPattern struct { + Field string + Pattern string + PatternType PatternType + SearchQuery search.Query + Postings postings.List +} + +func (q *PostingsListCache) CachedPatterns( + uuid uuid.UUID, + patternType PatternType, +) []CachedPattern { + q.registry.RLock() + defer q.registry.RUnlock() + + segmentPostings, ok := q.registry.active[uuid.Array()] + if !ok { + return nil + } + + n := 0 + for key := range segmentPostings { + if patternType == key.patternType { + n++ + } + } + + if n == 0 { + return nil + } + + patterns := make([]CachedPattern, 0, n) + for key, value := range segmentPostings { + if patternType == key.patternType { + patterns = append(patterns, CachedPattern{ + Field: key.field, + Pattern: key.pattern, + PatternType: key.patternType, + SearchQuery: key.searchQuery, + Postings: value, + }) + } + } + + return patterns +} + +func (q *PostingsListCache) processEventWithLock(ev postingsListEvent) { + uuid := ev.cachedPostings.segmentUUID.Array() + key := registryKey{ + field: ev.cachedPostings.field, + pattern: ev.cachedPostings.pattern, + patternType: ev.cachedPostings.patternType, + searchQuery: ev.cachedPostings.searchQuery, + } + segmentPostings, ok := q.registry.active[uuid] + if !ok { + segmentPostings = make(map[registryKey]postings.List) + q.registry.active[uuid] = segmentPostings + } + + switch ev.eventType { + case removeEventType: + delete(segmentPostings, key) + if len(segmentPostings) == 0 { + delete(q.registry.active, uuid) + } + case addEventType: + segmentPostings[key] = ev.cachedPostings.postings + } +} + // Report will emit metrics about the status of the cache. func (q *PostingsListCache) Report() { q.metrics.capacity.Update(float64(q.size)) diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index a94399882a..f51cda1266 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -59,16 +59,9 @@ type ReadThroughSegment struct { opts ReadThroughSegmentOptions - searches readThroughSegmentSearches - closed bool } -type readThroughSegmentSearches struct { - sync.RWMutex - queries map[string]int -} - // ReadThroughSegmentCaches is the set of caches // to use for the read through segment. 
type ReadThroughSegmentCaches struct { @@ -95,15 +88,12 @@ func NewReadThroughSegment( seg segment.ImmutableSegment, caches ReadThroughSegmentCaches, opts ReadThroughSegmentOptions, -) segment.Segment { +) *ReadThroughSegment { return &ReadThroughSegment{ segment: seg, opts: opts, uuid: uuid.NewUUID(), caches: caches, - searches: readThroughSegmentSearches{ - queries: make(map[string]int), - }, } } @@ -171,6 +161,28 @@ func (r *ReadThroughSegment) Size() int64 { return r.segment.Size() } +func (r *ReadThroughSegment) PutCachedSearchPattern( + queryStr string, + query search.Query, + pl postings.List, +) { + cache := r.caches.SearchPostingsListCache + if cache == nil || !r.opts.CacheSearches { + return + } + + cache.PutSearch(r.uuid, queryStr, query, pl) +} + +func (r *ReadThroughSegment) CachedSearchPatterns() []CachedPattern { + cache := r.caches.SearchPostingsListCache + if cache == nil || !r.opts.CacheSearches { + return nil + } + + return cache.CachedPatterns(r.uuid, PatternTypeSearch) +} + var _ search.ReadThroughSegmentSearcher = (*readThroughSegmentReader)(nil) type readThroughSegmentReader struct { @@ -357,37 +369,10 @@ func (s *readThroughSegmentReader) Search( return nil, err } - s.seg.searches.Lock() - count := 1 - curr, ok := s.seg.searches.queries[queryStr] - if !ok { - if len(s.seg.searches.queries) >= cache.size { - // Delete a random key to make room. - for k := range s.seg.searches.queries { - delete(s.seg.searches.queries, k) - break // Immediately break. - } - s.seg.searches.queries[queryStr] = count - } - } else { - count = curr + 1 - } - willCache := count > 1 - if willCache { - // Delete out of the seen query count. - delete(s.seg.searches.queries, queryStr) - } else { - // Update seen count. - s.seg.searches.queries[queryStr] = count - } - s.seg.searches.Unlock() - - if willCache { - // Only cache the second time seen a recent query since - // copying the postings lists into a roaring postings list - // can be expensive (in PutSearch). - cache.PutSearch(s.uuid, queryStr, pl) - } + // Only cache the second time seen a recent query since + // copying the postings lists into a roaring postings list + // can be expensive (in PutSearch). + cache.PutSearch(s.uuid, queryStr, query, pl) return pl, nil } From 0738d68e84b1865df014acc3feded7acc1b8ea4a Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 6 Apr 2021 15:58:33 -0400 Subject: [PATCH 094/106] Add concurrent execution of cached searches --- src/dbnode/storage/index.go | 4 +- src/dbnode/storage/index/block.go | 58 +++++-- src/dbnode/storage/index/mutable_segments.go | 159 +++++++++++------- .../storage/index/postings_list_cache.go | 26 ++- .../storage/index/read_through_segment.go | 19 ++- src/dbnode/storage/index/types.go | 2 +- 6 files changed, 177 insertions(+), 91 deletions(-) diff --git a/src/dbnode/storage/index.go b/src/dbnode/storage/index.go index e042cc429b..2dfde2beee 100644 --- a/src/dbnode/storage/index.go +++ b/src/dbnode/storage/index.go @@ -1111,7 +1111,9 @@ func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) { } // We only rotate cold mutable segments in phase I of cold flushing. for _, block := range flushable { - block.RotateColdMutableSegments() + if err := block.RotateColdMutableSegments(); err != nil { + return nil, err + } } // We can't immediately evict cold mutable segments so we return a callback to do so // when cold flush finishes. 
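NB: the block.go change that follows sizes a shared worker pool at roughly a
quarter of the available CPUs and threads it through to every mutableSegments
instance. A minimal, self-contained sketch of the token-based bounded
concurrency this relies on; the workerPool type and the rerun loop below are
illustrative stand-ins, not the patch's actual xsync implementation:

package main

import (
	"fmt"
	"math"
	"runtime"
	"sync"
)

// workerPool bounds concurrency with a buffered channel of tokens.
type workerPool struct{ tokens chan struct{} }

func newWorkerPool(size int) *workerPool {
	p := &workerPool{tokens: make(chan struct{}, size)}
	for i := 0; i < size; i++ {
		p.tokens <- struct{}{}
	}
	return p
}

// Go blocks until a worker token is free, then runs work on a goroutine
// and returns the token when the work completes.
func (p *workerPool) Go(work func()) {
	token := <-p.tokens
	go func() {
		defer func() { p.tokens <- token }()
		work()
	}()
}

func main() {
	// Size the pool to ~25% of CPUs with a floor of one, as NewBlock does.
	cpus := int(math.Max(1, math.Ceil(0.25*float64(runtime.NumCPU()))))
	pool := newWorkerPool(cpus)

	patterns := []string{"q1", "q2", "q3"} // illustrative cached patterns
	var wg sync.WaitGroup
	for _, pat := range patterns {
		pat := pat // capture loop variable
		wg.Add(1)
		pool.Go(func() {
			defer wg.Done()
			fmt.Println("rerunning cached search:", pat)
		})
	}
	wg.Wait()
}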
diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 996c5a8375..a1ebcd2330 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -26,6 +26,7 @@ import ( "fmt" "io" "math" + "runtime" "sort" "sync" "time" @@ -147,6 +148,7 @@ type block struct { queryLimits limits.QueryLimits docsLimit limits.LookbackLimit querySegmentsWorkers xsync.WorkerPool + cachedSearchesWorkers xsync.PooledWorkerPool metrics blockMetrics logger *zap.Logger @@ -223,26 +225,45 @@ func NewBlock( iopts := opts.InstrumentOptions() scope := iopts.MetricsScope().SubScope("index").SubScope("block") iopts = iopts.SetMetricsScope(scope) - segs := newMutableSegments( + + cpus := int(math.Max(1, math.Ceil(0.25*float64(runtime.NumCPU())))) + poolOpts := xsync.NewPooledWorkerPoolOptions(). + SetGrowOnDemand(false). + SetNumShards(1). + SetInstrumentOptions(iopts.SetMetricsScope(iopts.MetricsScope().SubScope("cached-searches"))) + cachedSearchesWorkers, err := xsync.NewPooledWorkerPool(cpus, poolOpts) + if err != nil { + return nil, err + } + + segs, err := newMutableSegments( md, blockStart, opts, blockOpts, + cachedSearchesWorkers, namespaceRuntimeOptsMgr, iopts, ) + if err != nil { + return nil, err + } - // NB(bodu): The length of coldMutableSegments is always at least 1. - coldSegs := []*mutableSegments{ - newMutableSegments( - md, - blockStart, - opts, - blockOpts, - namespaceRuntimeOptsMgr, - iopts, - ), + coldSegs, err := newMutableSegments( + md, + blockStart, + opts, + blockOpts, + cachedSearchesWorkers, + namespaceRuntimeOptsMgr, + iopts, + ) + if err != nil { + return nil, err } + + // NB(bodu): The length of coldMutableSegments is always at least 1. + coldMutableSegments := []*mutableSegments{coldSegs} b := &block{ state: blockStateOpen, blockStart: blockStart, @@ -250,7 +271,7 @@ func NewBlock( blockSize: blockSize, blockOpts: blockOpts, mutableSegments: segs, - coldMutableSegments: coldSegs, + coldMutableSegments: coldMutableSegments, shardRangesSegmentsByVolumeType: make(shardRangesSegmentsByVolumeType), opts: opts, iopts: iopts, @@ -261,6 +282,7 @@ func NewBlock( queryLimits: opts.QueryLimits(), docsLimit: opts.QueryLimits().DocsLimit(), querySegmentsWorkers: opts.QueryBlockSegmentWorkerPool(), + cachedSearchesWorkers: cachedSearchesWorkers, } b.newFieldsAndTermsIteratorFn = newFieldsAndTermsIterator @@ -1226,17 +1248,23 @@ func (b *block) EvictColdMutableSegments() error { return nil } -func (b *block) RotateColdMutableSegments() { +func (b *block) RotateColdMutableSegments() error { b.Lock() defer b.Unlock() - b.coldMutableSegments = append(b.coldMutableSegments, newMutableSegments( + coldSegs, err := newMutableSegments( b.nsMD, b.blockStart, b.opts, b.blockOpts, + b.cachedSearchesWorkers, b.namespaceRuntimeOptsMgr, b.iopts, - )) + ) + if err != nil { + return err + } + b.coldMutableSegments = append(b.coldMutableSegments, coldSegs) + return nil } func (b *block) MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) { diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index dd8f23341c..15e82209b4 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -40,6 +40,7 @@ import ( "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" xresource "github.com/m3db/m3/src/x/resource" + xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" "github.com/uber-go/tally" @@ -81,6 +82,7 @@ type 
mutableSegments struct { iopts instrument.Options optsListener xresource.SimpleCloser writeIndexingConcurrency int + cachedSearchesWorkers xsync.PooledWorkerPool sealedBlockStarts map[xtime.UnixNano]struct{} backgroundCompactGCPending bool @@ -113,14 +115,20 @@ func (f *indexedBloomFilter) Write(id []byte) { } type mutableSegmentsMetrics struct { - foregroundCompactionPlanRunLatency tally.Timer - foregroundCompactionTaskRunLatency tally.Timer - backgroundCompactionPlanRunLatency tally.Timer - backgroundCompactionTaskRunLatency tally.Timer - activeBlockIndexNew tally.Counter - activeBlockGarbageCollectSegment tally.Counter - activeBlockGarbageCollectSeries tally.Counter - backgroundCompactionRerunCachedSearch tally.Counter + foregroundCompactionPlanRunLatency tally.Timer + foregroundCompactionTaskRunLatency tally.Timer + backgroundCompactionPlanRunLatency tally.Timer + backgroundCompactionTaskRunLatency tally.Timer + activeBlockIndexNew tally.Counter + activeBlockGarbageCollectSegment tally.Counter + activeBlockGarbageCollectSeries tally.Counter + activeBlockGarbageCollectEmptySegment tally.Counter + activeBlockGarbageCollectCachedSearchesDisabled tally.Counter + activeBlockGarbageCollectCachedSearchesInRegistry tally.Counter + activeBlockGarbageCollectCachedSearchesNotInRegistry tally.Counter + activeBlockGarbageCollectCachedSearchesTotal tally.Histogram + activeBlockGarbageCollectCachedSearchesMatched tally.Histogram + activeBlockGarbageCollectRerunCachedSearch tally.Counter } func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { @@ -135,9 +143,21 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { activeBlockIndexNew: activeBlockScope.Tagged(map[string]string{ "result_type": "new", }).Counter("index-result"), - activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), - activeBlockGarbageCollectSeries: activeBlockScope.Counter("gc-series"), - backgroundCompactionRerunCachedSearch: backgroundScope.Counter("rerun-cached-search"), + activeBlockGarbageCollectSegment: activeBlockScope.Counter("gc-segment"), + activeBlockGarbageCollectSeries: activeBlockScope.Counter("gc-series"), + activeBlockGarbageCollectEmptySegment: backgroundScope.Counter("gc-empty-segment"), + activeBlockGarbageCollectCachedSearchesDisabled: backgroundScope.Counter("gc-cached-searches-disabled"), + activeBlockGarbageCollectCachedSearchesInRegistry: backgroundScope.Tagged(map[string]string{ + "found": "true", + }).Counter("gc-cached-searches-in-registry"), + activeBlockGarbageCollectCachedSearchesNotInRegistry: backgroundScope.Tagged(map[string]string{ + "found": "false", + }).Counter("gc-cached-searches-in-registry"), + activeBlockGarbageCollectCachedSearchesTotal: backgroundScope.Histogram("gc-cached-searches-total", + append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)), + activeBlockGarbageCollectCachedSearchesMatched: backgroundScope.Histogram("gc-cached-searches-matched", + append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)), + activeBlockGarbageCollectRerunCachedSearch: backgroundScope.Counter("gc-rerun-cached-search"), } } @@ -148,22 +168,24 @@ func newMutableSegments( blockStart time.Time, opts Options, blockOpts BlockOptions, + cachedSearchesWorkers xsync.PooledWorkerPool, namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager, iopts instrument.Options, -) *mutableSegments { +) (*mutableSegments, error) { m := &mutableSegments{ - blockStart: blockStart, - blockSize: 
md.Options().IndexOptions().BlockSize(), - opts: opts, - blockOpts: blockOpts, - compact: mutableSegmentsCompact{opts: opts, blockOpts: blockOpts}, - sealedBlockStarts: make(map[xtime.UnixNano]struct{}), - iopts: iopts, - metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), - logger: iopts.Logger(), + blockStart: blockStart, + blockSize: md.Options().IndexOptions().BlockSize(), + opts: opts, + blockOpts: blockOpts, + compact: mutableSegmentsCompact{opts: opts, blockOpts: blockOpts}, + cachedSearchesWorkers: cachedSearchesWorkers, + sealedBlockStarts: make(map[xtime.UnixNano]struct{}), + iopts: iopts, + metrics: newMutableSegmentsMetrics(iopts.MetricsScope()), + logger: iopts.Logger(), } m.optsListener = namespaceRuntimeOptsMgr.RegisterListener(m) - return m + return m, nil } func (m *mutableSegments) NotifySealedBlocks( @@ -736,7 +758,9 @@ func (m *mutableSegments) backgroundCompactWithTask( } var replaceSeg segment.Segment - if !empty { + if empty { + m.metrics.activeBlockGarbageCollectEmptySegment.Inc(1) + } else { // Add a read through cache for repeated expensive queries against // background compacted segments since they can live for quite some // time and accrue a large set of documents. @@ -758,48 +782,65 @@ func (m *mutableSegments) backgroundCompactWithTask( continue } - searches := prevReadThroughSeg.CachedSearchPatterns() + searches, result := prevReadThroughSeg.CachedSearchPatterns() + if result.CacheSearchesDisabled { + m.metrics.activeBlockGarbageCollectCachedSearchesDisabled.Inc(1) + } + if result.CachedPatternsResult.InRegistry { + m.metrics.activeBlockGarbageCollectCachedSearchesInRegistry.Inc(1) + } else { + m.metrics.activeBlockGarbageCollectCachedSearchesNotInRegistry.Inc(1) + } + total := float64(result.CachedPatternsResult.TotalPatterns) + m.metrics.activeBlockGarbageCollectCachedSearchesTotal.RecordValue(total) + matched := float64(len(searches)) + m.metrics.activeBlockGarbageCollectCachedSearchesMatched.RecordValue(matched) for _, s := range searches { - r, err := readThroughSeg.Reader() - if err != nil { - instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) { - l.Error("failed to create read through segment reader") - }) - continue - } + s := s // Capture for loop. 
+ m.cachedSearchesWorkers.Go(func() {
+ r, err := readThroughSeg.Reader()
+ if err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to create read through segment reader")
+ })
+ return
+ }

- if s.SearchQuery == nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("no search query for cached search pattern")
- })
- continue
- }
+ defer func() {
+ if err := r.Close(); err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to close read through segment reader")
+ })
+ }
+ }()
+
+ if s.SearchQuery == nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("no search query for cached search pattern")
+ })
+ return
+ }

- searcher, err := s.SearchQuery.Searcher()
- if err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to create searcher from cached search pattern")
- })
- continue
- }
+ searcher, err := s.SearchQuery.Searcher()
+ if err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to create searcher from cached search pattern")
+ })
+ return
+ }

- pl, err := searcher.Search(r)
- if err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to search read through segment reader")
- })
- continue
- }
+ pl, err := searcher.Search(r)
+ if err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to search read through segment reader")
+ })
+ return
+ }

- if err := r.Close(); err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to close read through segment reader")
- })
- continue
- }
+ readThroughSeg.PutCachedSearchPattern(s.Field, s.SearchQuery, pl)
+ m.metrics.activeBlockGarbageCollectRerunCachedSearch.Inc(1)
+ })

- readThroughSeg.PutCachedSearchPattern(s.Field, s.SearchQuery, pl)
- m.metrics.backgroundCompactionRerunCachedSearch.Inc(1)
- }
 }
 }
 }
diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go
index f854da0541..2098d32899 100644
--- a/src/dbnode/storage/index/postings_list_cache.go
+++ b/src/dbnode/storage/index/postings_list_cache.go
@@ -389,30 +389,37 @@ type CachedPattern struct {
 Postings postings.List
 }

+type CachedPatternsResult struct {
+ InRegistry bool
+ TotalPatterns int
+ MatchedPatterns int
+}
+
 func (q *PostingsListCache) CachedPatterns(
 uuid uuid.UUID,
 patternType PatternType,
-) []CachedPattern {
+) ([]CachedPattern, CachedPatternsResult) {
+ var result CachedPatternsResult
+
 q.registry.RLock()
 defer q.registry.RUnlock()

 segmentPostings, ok := q.registry.active[uuid.Array()]
 if !ok {
- return nil
+ return nil, result
 }

- n := 0
+ result.InRegistry = true
 for key := range segmentPostings {
 if patternType == key.patternType {
- n++
+ result.TotalPatterns++
 }
 }
-
- if n == 0 {
- return nil
+ if result.TotalPatterns == 0 {
+ return nil, CachedPatternsResult{}
 }

- patterns := make([]CachedPattern, 0, n)
+ patterns := make([]CachedPattern, 0, result.TotalPatterns)
 for key, value := range segmentPostings {
 if patternType == key.patternType {
 patterns = append(patterns, CachedPattern{
@@ -422,10 +429,11 @@ func (q *PostingsListCache) CachedPatterns(
 SearchQuery: key.searchQuery,
 Postings: value,
 })
+ result.MatchedPatterns++
 }
 }

- return patterns
+ return patterns, result
 }

 func (q *PostingsListCache)
processEventWithLock(ev postingsListEvent) { diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index f51cda1266..d858f9b061 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -174,13 +174,23 @@ func (r *ReadThroughSegment) PutCachedSearchPattern( cache.PutSearch(r.uuid, queryStr, query, pl) } -func (r *ReadThroughSegment) CachedSearchPatterns() []CachedPattern { +type CachedSearchPatternsResult struct { + CacheSearchesDisabled bool + CachedPatternsResult CachedPatternsResult +} + +func (r *ReadThroughSegment) CachedSearchPatterns() ([]CachedPattern, CachedSearchPatternsResult) { cache := r.caches.SearchPostingsListCache if cache == nil || !r.opts.CacheSearches { - return nil + return nil, CachedSearchPatternsResult{ + CacheSearchesDisabled: true, + } } - return cache.CachedPatterns(r.uuid, PatternTypeSearch) + patterns, result := cache.CachedPatterns(r.uuid, PatternTypeSearch) + return patterns, CachedSearchPatternsResult{ + CachedPatternsResult: result, + } } var _ search.ReadThroughSegmentSearcher = (*readThroughSegmentReader)(nil) @@ -369,9 +379,6 @@ func (s *readThroughSegmentReader) Search( return nil, err } - // Only cache the second time seen a recent query since - // copying the postings lists into a roaring postings list - // can be expensive (in PutSearch). cache.PutSearch(s.uuid, queryStr, query, pl) return pl, nil diff --git a/src/dbnode/storage/index/types.go b/src/dbnode/storage/index/types.go index afd72fbd22..7860cebd5c 100644 --- a/src/dbnode/storage/index/types.go +++ b/src/dbnode/storage/index/types.go @@ -446,7 +446,7 @@ type Block interface { // RotateColdMutableSegments rotates the currently active cold mutable segment out for a // new cold mutable segment to write to. - RotateColdMutableSegments() + RotateColdMutableSegments() error // MemorySegmentsData returns all in memory segments data. 
MemorySegmentsData(ctx context.Context) ([]fst.SegmentData, error) From 1f8a5101e21d5fef77ca56f638456dfe8ab719a3 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 6 Apr 2021 18:21:36 -0400 Subject: [PATCH 095/106] Initialize worker pool --- src/dbnode/storage/index/block.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index a1ebcd2330..414157f773 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -236,6 +236,8 @@ func NewBlock( return nil, err } + cachedSearchesWorkers.Init() + segs, err := newMutableSegments( md, blockStart, From 9cf50ee8dbbdcfc3165ba669055941d41bc7700f Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Sat, 10 Apr 2021 09:03:03 -0400 Subject: [PATCH 096/106] Reuse cached postings when populating cached searches --- go.sum | 5 +- src/dbnode/storage/index/block.go | 12 +- .../storage/index/compaction/compactor.go | 27 +- src/dbnode/storage/index/mutable_segments.go | 394 ++++++++++++++---- .../storage/index/postings_list_cache.go | 81 ++-- .../storage/index/read_through_segment.go | 10 +- .../segment/builder/multi_segments_builder.go | 20 + .../builder/multi_segments_terms_iter.go | 2 +- src/m3ninx/index/segment/types.go | 19 + .../postings/roaring/bitmap_multi_readonly.go | 7 + src/m3ninx/search/query/codec.go | 11 +- src/x/sync/types.go | 3 + src/x/sync/worker_pool.go | 4 + 13 files changed, 457 insertions(+), 138 deletions(-) diff --git a/go.sum b/go.sum index 9e1670688f..4acab0b66f 100644 --- a/go.sum +++ b/go.sum @@ -188,10 +188,10 @@ github.com/daviddengcn/go-assert v0.0.0-20150305222929-ba7e68aeeff6 h1:OPIYL/VhQ github.com/daviddengcn/go-assert v0.0.0-20150305222929-ba7e68aeeff6/go.mod h1:N+OekMaElW3rSAfDdNX6Dff3HS237/OhC08jYFW4oCw= github.com/daviddengcn/go-villa v0.0.0-20160111144444-3f35da8ba982 h1:2Trx4ntMtxmus9nN2w1PIqJOI8jB3RjlnDnFm/ImlIU= github.com/daviddengcn/go-villa v0.0.0-20160111144444-3f35da8ba982/go.mod h1:U8xNoHcXfPnZzy9zCxeKRjaJgC1d3613rFHjZVVAqKc= -github.com/dgraph-io/ristretto v0.0.3 h1:jh22xisGBjrEVnRZ1DVTpBVQm0Xndu8sMl0CWDzSIBI= -github.com/dgraph-io/ristretto v0.0.3/go.mod h1:KPxhHT9ZxKefz+PCeOGsrHpl1qZ7i70dGTu2u+Ahh6E= github.com/denis-tingajkin/go-header v0.3.1 h1:ymEpSiFjeItCy1FOP+x0M2KdCELdEAHUsNa8F+hHc6w= github.com/denis-tingajkin/go-header v0.3.1/go.mod h1:sq/2IxMhaZX+RRcgHfCRx/m0M5na0fBt4/CRe7Lrji0= +github.com/dgraph-io/ristretto v0.0.3 h1:jh22xisGBjrEVnRZ1DVTpBVQm0Xndu8sMl0CWDzSIBI= +github.com/dgraph-io/ristretto v0.0.3/go.mod h1:KPxhHT9ZxKefz+PCeOGsrHpl1qZ7i70dGTu2u+Ahh6E= github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= @@ -1200,6 +1200,7 @@ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208 h1:qwRHBd0NqMbJxfbotnDhm2ByMI1Shq4Y6oRJo21SGJA= golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ= golang.org/x/sys v0.0.0-20200826173525-f9321e4c35a6 h1:DvY3Zkh7KabQE/kfzMvYvKirSiguP9Q/veMtkYyf0o8= 
golang.org/x/sys v0.0.0-20200826173525-f9321e4c35a6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/src/dbnode/storage/index/block.go b/src/dbnode/storage/index/block.go index 414157f773..58f293a4ca 100644 --- a/src/dbnode/storage/index/block.go +++ b/src/dbnode/storage/index/block.go @@ -148,7 +148,7 @@ type block struct { queryLimits limits.QueryLimits docsLimit limits.LookbackLimit querySegmentsWorkers xsync.WorkerPool - cachedSearchesWorkers xsync.PooledWorkerPool + cachedSearchesWorkers xsync.WorkerPool metrics blockMetrics logger *zap.Logger @@ -227,15 +227,7 @@ func NewBlock( iopts = iopts.SetMetricsScope(scope) cpus := int(math.Max(1, math.Ceil(0.25*float64(runtime.NumCPU())))) - poolOpts := xsync.NewPooledWorkerPoolOptions(). - SetGrowOnDemand(false). - SetNumShards(1). - SetInstrumentOptions(iopts.SetMetricsScope(iopts.MetricsScope().SubScope("cached-searches"))) - cachedSearchesWorkers, err := xsync.NewPooledWorkerPool(cpus, poolOpts) - if err != nil { - return nil, err - } - + cachedSearchesWorkers := xsync.NewWorkerPool(cpus) cachedSearchesWorkers.Init() segs, err := newMutableSegments( diff --git a/src/dbnode/storage/index/compaction/compactor.go b/src/dbnode/storage/index/compaction/compactor.go index af55b4158b..2c3bd309f9 100644 --- a/src/dbnode/storage/index/compaction/compactor.go +++ b/src/dbnode/storage/index/compaction/compactor.go @@ -101,6 +101,12 @@ func NewCompactor( }, nil } +// CompactResult is the result of a call to compact. +type CompactResult struct { + Compacted fst.Segment + SegmentMetadatas []segment.SegmentsBuilderSegmentMetadata +} + // Compact will take a set of segments and compact them into an immutable // FST segment, if there is a single mutable segment it can directly be // converted into an FST segment, otherwise an intermediary mutable segment @@ -113,21 +119,34 @@ func (c *Compactor) Compact( filter segment.DocumentsFilter, filterCounter tally.Counter, reporterOptions mmap.ReporterOptions, -) (fst.Segment, error) { +) (CompactResult, error) { c.Lock() defer c.Unlock() if c.closed { - return nil, errCompactorClosed + return CompactResult{}, errCompactorClosed } c.builder.Reset() c.builder.SetFilter(filter, filterCounter) if err := c.builder.AddSegments(segs); err != nil { - return nil, err + return CompactResult{}, err + } + + metas, err := c.builder.SegmentMetadatas() + if err != nil { + return CompactResult{}, err + } + + compacted, err := c.compactFromBuilderWithLock(c.builder, reporterOptions) + if err != nil { + return CompactResult{}, err } - return c.compactFromBuilderWithLock(c.builder, reporterOptions) + return CompactResult{ + Compacted: compacted, + SegmentMetadatas: metas, + }, nil } // CompactUsingBuilder compacts segments together using a provided segment builder. 
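NB: the SegmentMetadatas returned by Compact above (a base offset plus
per-document negative offsets marking duplicates) drive the postings ID
translation performed in mutable_segments.go below. A self-contained sketch of
that translation under the same assumptions; translatePostings and the sample
values are illustrative, not from the patch:

package main

import "fmt"

// translatePostings maps postings IDs from a source segment into the
// compacted segment's ID space. negativeOffsets[id] is -1 for documents
// deduplicated away, otherwise the count of skipped documents preceding id;
// offset is the source segment's base ID in the compacted segment.
func translatePostings(src []uint64, offset uint64, negativeOffsets []int64) []uint64 {
	out := make([]uint64, 0, len(src))
	for _, id := range src {
		neg := negativeOffsets[id]
		if neg == -1 {
			continue // document was a duplicate; skip it entirely
		}
		out = append(out, id+offset-uint64(neg))
	}
	return out
}

func main() {
	// Hypothetical: doc 1 was a duplicate, so later IDs shift down by one.
	negativeOffsets := []int64{0, -1, 1, 1}
	fmt.Println(translatePostings([]uint64{0, 2, 3}, 100, negativeOffsets))
	// Output: [100 101 102]
}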
diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 15e82209b4..79fa11a566 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -23,6 +23,7 @@ package index import ( "errors" "fmt" + "io" "math" "runtime" "sync" @@ -32,19 +33,29 @@ import ( "github.com/m3db/m3/src/dbnode/storage/index/compaction" "github.com/m3db/m3/src/dbnode/storage/index/segments" "github.com/m3db/m3/src/m3ninx/doc" + "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" m3ninxindex "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/index/segment" "github.com/m3db/m3/src/m3ninx/index/segment/builder" "github.com/m3db/m3/src/m3ninx/index/segment/fst" + "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/pilosa" + "github.com/m3db/m3/src/m3ninx/postings/roaring" + "github.com/m3db/m3/src/m3ninx/search" + "github.com/m3db/m3/src/m3ninx/search/query" + "github.com/m3db/m3/src/m3ninx/x" "github.com/m3db/m3/src/x/context" + xerrors "github.com/m3db/m3/src/x/errors" "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" xresource "github.com/m3db/m3/src/x/resource" xsync "github.com/m3db/m3/src/x/sync" xtime "github.com/m3db/m3/src/x/time" + bitmap "github.com/m3dbx/pilosa/roaring" "github.com/uber-go/tally" "go.uber.org/zap" + "golang.org/x/sync/errgroup" ) var ( @@ -82,7 +93,7 @@ type mutableSegments struct { iopts instrument.Options optsListener xresource.SimpleCloser writeIndexingConcurrency int - cachedSearchesWorkers xsync.PooledWorkerPool + cachedSearchesWorkers xsync.WorkerPool sealedBlockStarts map[xtime.UnixNano]struct{} backgroundCompactGCPending bool @@ -115,20 +126,23 @@ func (f *indexedBloomFilter) Write(id []byte) { } type mutableSegmentsMetrics struct { - foregroundCompactionPlanRunLatency tally.Timer - foregroundCompactionTaskRunLatency tally.Timer - backgroundCompactionPlanRunLatency tally.Timer - backgroundCompactionTaskRunLatency tally.Timer - activeBlockIndexNew tally.Counter - activeBlockGarbageCollectSegment tally.Counter - activeBlockGarbageCollectSeries tally.Counter - activeBlockGarbageCollectEmptySegment tally.Counter - activeBlockGarbageCollectCachedSearchesDisabled tally.Counter - activeBlockGarbageCollectCachedSearchesInRegistry tally.Counter - activeBlockGarbageCollectCachedSearchesNotInRegistry tally.Counter - activeBlockGarbageCollectCachedSearchesTotal tally.Histogram - activeBlockGarbageCollectCachedSearchesMatched tally.Histogram - activeBlockGarbageCollectRerunCachedSearch tally.Counter + foregroundCompactionPlanRunLatency tally.Timer + foregroundCompactionTaskRunLatency tally.Timer + backgroundCompactionPlanRunLatency tally.Timer + backgroundCompactionTaskRunLatency tally.Timer + activeBlockIndexNew tally.Counter + activeBlockGarbageCollectSegment tally.Counter + activeBlockGarbageCollectSeries tally.Counter + activeBlockGarbageCollectEmptySegment tally.Counter + activeBlockGarbageCollectCachedSearchesDisabled tally.Counter + activeBlockGarbageCollectCachedSearchesInRegistry tally.Counter + activeBlockGarbageCollectCachedSearchesNotInRegistry tally.Counter + activeBlockGarbageCollectCachedSearchesTotal tally.Histogram + activeBlockGarbageCollectCachedSearchesMatched tally.Histogram + activeBlockGarbageCollectReconstructCachedSearchSuccess tally.Counter + activeBlockGarbageCollectReconstructCachedSearchError tally.Counter + activeBlockGarbageCollectReconstructCachedSearchCacheHit tally.Counter + 
activeBlockGarbageCollectReconstructCachedSearchCacheMiss tally.Counter
 }

 func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics {
@@ -157,7 +171,18 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics {
 append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)),
 activeBlockGarbageCollectCachedSearchesMatched: backgroundScope.Histogram("gc-cached-searches-matched",
 append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)),
- activeBlockGarbageCollectRerunCachedSearch: backgroundScope.Counter("gc-rerun-cached-search"),
+ activeBlockGarbageCollectReconstructCachedSearchSuccess: backgroundScope.Tagged(map[string]string{
+ "result_type": "success",
+ }).Counter("gc-reconstruct-cached-search"),
+ activeBlockGarbageCollectReconstructCachedSearchError: backgroundScope.Tagged(map[string]string{
+ "result_type": "error",
+ }).Counter("gc-reconstruct-cached-search"),
+ activeBlockGarbageCollectReconstructCachedSearchCacheHit: backgroundScope.Tagged(map[string]string{
+ "result_type": "cache_hit",
+ }).Counter("gc-reconstruct-cached-search-cache-result"),
+ activeBlockGarbageCollectReconstructCachedSearchCacheMiss: backgroundScope.Tagged(map[string]string{
+ "result_type": "cache_miss",
+ }).Counter("gc-reconstruct-cached-search-cache-result"),
 }
 }

@@ -168,7 +193,7 @@ func newMutableSegments(
 blockStart time.Time,
 opts Options,
 blockOpts BlockOptions,
- cachedSearchesWorkers xsync.PooledWorkerPool,
+ cachedSearchesWorkers xsync.WorkerPool,
 namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
 iopts instrument.Options,
 ) (*mutableSegments, error) {
@@ -731,7 +756,7 @@ func (m *mutableSegments) backgroundCompactWithTask(
 }

 start := time.Now()
- compacted, err := compactor.Compact(segments, documentsFilter,
+ compactResult, err := compactor.Compact(segments, documentsFilter,
 m.metrics.activeBlockGarbageCollectSeries,
 mmap.ReporterOptions{
 Context: mmap.Context{
@@ -757,7 +782,11 @@ func (m *mutableSegments) backgroundCompactWithTask(
 return err
 }

- var replaceSeg segment.Segment
+ var (
+ compacted = compactResult.Compacted
+ segMetas = compactResult.SegmentMetadatas
+ replaceSeg segment.Segment
+ )
 if empty {
 m.metrics.activeBlockGarbageCollectEmptySegment.Inc(1)
 } else {
@@ -773,87 +802,294 @@ func (m *mutableSegments) backgroundCompactWithTask(
 // Note: There were very obvious peaks of latency (p99 of <500ms spiking
 // to 8 times that at first replace of large segments after a block
 // rotation) without this optimization.
- for _, segment := range segments {
- prevReadThroughSeg, ok := segment.(*ReadThroughSegment)
+ if err := m.populateCachedSearches(readThroughSeg, segMetas); err != nil {
+ instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
+ l.Error("failed to populate cached searches", zap.Error(err))
+ })
+ }
+ }
+
+ // Rotate out the replaced frozen segments and add the compacted one.
+ m.Lock()
+ defer m.Unlock()
+
+ result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments,
+ segments, replaceSeg)
+ m.backgroundSegments = result
+
+ return nil
+}
+
+type cachedPatternForCompactedSegment struct {
+ field string
+ searchQuery *querypb.Query
+ patterns []cachedPatternFromSegment
+}
+
+type cachedPatternFromSegment struct {
+ prevSeg prevSegment
+ hasCachedPattern bool
+ cachedPattern CachedPattern
+}
+
+type prevSegment struct {
+ segment *ReadThroughSegment
+ meta segment.SegmentsBuilderSegmentMetadata
+}
+
+const (
+ defaultBitmapContainerPooling = 128
+)
+
+type populateCachedSearchesWorker struct {
+ postings postings.MutableList
+ encoder *pilosa.Encoder
+ closers []x.SafeCloser
+ cachedClosers []x.SafeCloser
+}
+
+func newPopulateCachedSearchesWorker() *populateCachedSearchesWorker {
+ b := bitmap.NewBitmapWithDefaultPooling(defaultBitmapContainerPooling)
+ return &populateCachedSearchesWorker{
+ postings: roaring.NewPostingsListFromBitmap(b),
+ encoder: pilosa.NewEncoder(),
+ }
+}
+
+func (w *populateCachedSearchesWorker) addCloser(c io.Closer) {
+ n := len(w.cachedClosers)
+ if n > 0 {
+ last := w.cachedClosers[n-1]
+ last.Reset(c)
+ w.cachedClosers[n-1] = nil
+ w.cachedClosers = w.cachedClosers[:n-1]
+ w.closers = append(w.closers, last)
+ return
+ }
+ w.closers = append(w.closers, x.NewSafeCloser(c))
+}
+
+func (w *populateCachedSearchesWorker) close() error {
+ multiErr := xerrors.NewMultiError()
+ for i, c := range w.closers {
+ multiErr = multiErr.Add(c.Close())
+ w.closers[i] = nil
+ c.Reset(nil)
+ w.cachedClosers = append(w.cachedClosers, c)
+ }
+ w.closers = w.closers[:0]
+ return multiErr.FinalError()
+}
+
+func (m *mutableSegments) populateCachedSearches(
+ compactedSeg *ReadThroughSegment,
+ prevSegsMetas []segment.SegmentsBuilderSegmentMetadata,
+) error {
+ prevSegs := make([]prevSegment, 0, len(prevSegsMetas))
+ for _, segMeta := range prevSegsMetas {
+ prevReadThroughSeg, ok := segMeta.Segment.(*ReadThroughSegment)
+ if !ok {
+ return fmt.Errorf("failed to cast compacted segment to read through segment")
+ }
+ prevSegs = append(prevSegs, prevSegment{
+ segment: prevReadThroughSeg,
+ meta: segMeta,
+ })
+ }
+
+ searches := make(map[string]cachedPatternForCompactedSegment)
+ for i, seg := range prevSegs {
+ result := seg.segment.CachedSearchPatterns(func(p CachedPattern) {
+ pattern, ok := searches[p.SearchQueryKey]
 if !ok {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to cast compacted segment to read through segment")
- })
+ pattern = cachedPatternForCompactedSegment{
+ field: p.Field,
+ searchQuery: p.SearchQuery,
+ patterns: make([]cachedPatternFromSegment, len(prevSegs)),
+ }
+ for j, prevSeg := range prevSegs {
+ pattern.patterns[j] = cachedPatternFromSegment{
+ prevSeg: prevSeg,
+ }
+ }
+ searches[p.SearchQueryKey] = pattern
+ }
+ // Mark this segment with the cached pattern.
+ pattern.patterns[i].hasCachedPattern = true
+ pattern.patterns[i].cachedPattern = p
+ })
+ if result.CacheSearchesDisabled {
+ m.metrics.activeBlockGarbageCollectCachedSearchesDisabled.Inc(1)
+ }
+ if result.CachedPatternsResult.InRegistry {
+ m.metrics.activeBlockGarbageCollectCachedSearchesInRegistry.Inc(1)
+ } else {
+ m.metrics.activeBlockGarbageCollectCachedSearchesNotInRegistry.Inc(1)
+ }
+ total := float64(result.CachedPatternsResult.TotalPatterns)
+ m.metrics.activeBlockGarbageCollectCachedSearchesTotal.RecordValue(total)
+ matched := float64(result.CachedPatternsResult.MatchedPatterns)
+ m.metrics.activeBlockGarbageCollectCachedSearchesMatched.RecordValue(matched)
+ }
+
+ var totalSegmentsSize int64
+ for _, seg := range prevSegs {
+ totalSegmentsSize += seg.segment.Size()
+ }
+
+ var (
+ numWorkers = m.cachedSearchesWorkers.Size()
+ workers = make(chan *populateCachedSearchesWorker, numWorkers)
+ instrumentedExec = func(fn func() error) func() error {
+ return func() error {
+ e := fn()
+ if e != nil {
+ m.metrics.activeBlockGarbageCollectReconstructCachedSearchError.Inc(1)
+ return e
+ }
+ m.metrics.activeBlockGarbageCollectReconstructCachedSearchSuccess.Inc(1)
+ return nil
+ }
+ }
+ group errgroup.Group
+ )
+ for i := 0; i < numWorkers; i++ {
+ workers <- newPopulateCachedSearchesWorker()
+ }
+ for _, s := range searches {
+ s := s // Capture for loop.
+
+ var totalSegmentsHasPatternSize int64
+ for i := range s.patterns {
+ if !s.patterns[i].hasCachedPattern {
 continue
 }
+ totalSegmentsHasPatternSize += s.patterns[i].prevSeg.segment.Size()
+ }

- searches, result := prevReadThroughSeg.CachedSearchPatterns()
- if result.CacheSearchesDisabled {
- m.metrics.activeBlockGarbageCollectCachedSearchesDisabled.Inc(1)
+ if totalSegmentsHasPatternSize < totalSegmentsSize/2 {
+ // If the segments that do have this pattern cached represent less
+ // than half the total size of all the segments being compacted
+ // together, don't bother repopulating it: the reconstruction could
+ // be expensive and the pattern wasn't cached for the largest
+ // segments we just compacted, so there is little benefit.
+ continue
+ }
+
+ // Control concurrency by taking and returning token from worker pool.
+ w := <-workers
+ group.Go(instrumentedExec(func() error {
+ defer func() {
+ // Close anything needed to be closed.
+ _ = w.close()
+ // Return worker.
+ workers <- w
+ }()
+
+ if s.searchQuery == nil {
+ return fmt.Errorf("no search query for cached search pattern")
 }
- if result.CachedPatternsResult.InRegistry {
- m.metrics.activeBlockGarbageCollectCachedSearchesInRegistry.Inc(1)
- } else {
- m.metrics.activeBlockGarbageCollectCachedSearchesNotInRegistry.Inc(1)
+
+ var searcher search.Searcher
+ search, err := query.UnmarshalProto(s.searchQuery)
+ if err != nil {
+ return fmt.Errorf("failed to unmarshal search for cached search pattern: %w", err)
 }
- total := float64(result.CachedPatternsResult.TotalPatterns)
- m.metrics.activeBlockGarbageCollectCachedSearchesTotal.RecordValue(total)
- matched := float64(len(searches))
- m.metrics.activeBlockGarbageCollectCachedSearchesMatched.RecordValue(matched)
- for _, s := range searches {
- s := s // Capture for loop.
- m.cachedSearchesWorkers.Go(func() {
- r, err := readThroughSeg.Reader()
- if err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to create read through segment reader")
- })
- return
- }

- defer func() {
- if err := r.Close(); err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to close read through segment reader")
+ // Reset reused postings.
+ w.postings.Reset()
+
+ for i := range s.patterns {
+ var iter postings.Iterator
+ if s.patterns[i].hasCachedPattern {
+ // If it has the cached pattern, no need to execute a search; can
+ // reuse the postings iterator from the cache.
+ iter = s.patterns[i].cachedPattern.Postings.Iterator()
+ w.addCloser(iter)
+
+ // Track hit.
+ m.metrics.activeBlockGarbageCollectReconstructCachedSearchCacheHit.Inc(1)
+ } else {
+ // Does not have this pattern cached, need to execute the search
+ // against this segment.
+ if searcher == nil {
+ searcher, err = search.Searcher()
+ if err != nil {
+ return fmt.Errorf("failed to create searcher for cached search pattern: %w", err)
 }
- }()
-
- if s.SearchQuery == nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("no search query for cached search pattern")
- })
- return
 }

- searcher, err := s.SearchQuery.Searcher()
+ reader, err := s.patterns[i].prevSeg.segment.Reader()
 if err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to create searcher from cached search pattern")
- })
- return
+ return fmt.Errorf("failed to create prev seg reader: %w", err)
 }

- pl, err := searcher.Search(r)
+ w.addCloser(reader)
+
+ pl, err := searcher.Search(reader)
 if err != nil {
- instrument.EmitAndLogInvariantViolation(m.iopts, func(l *zap.Logger) {
- l.Error("failed to search read through segment reader")
- })
- return
+ return fmt.Errorf("failed to search prev seg reader: %w", err)
 }

- readThroughSeg.PutCachedSearchPattern(s.Field, s.SearchQuery, pl)
- m.metrics.activeBlockGarbageCollectRerunCachedSearch.Inc(1)
- })
+ iter = pl.Iterator()
+ w.addCloser(iter)
+
+ // Track miss.
+ m.metrics.activeBlockGarbageCollectReconstructCachedSearchCacheMiss.Inc(1)
+ }
+ if s.patterns[i].prevSeg.meta.Offset == 0 && s.patterns[i].prevSeg.meta.Skips == 0 {
+ // No offset and no postings to skip, can copy into the reused postings.
+ if err := w.postings.AddIterator(iter); err != nil {
+ return fmt.Errorf("could not copy cached postings: %w", err)
+ }
+ continue
+ }
+
+ // We have to take into account offset and duplicates/skips.
+ negativeOffsets := s.patterns[i].prevSeg.meta.NegativeOffsets
+ for iter.Next() {
+ curr := iter.Current()
+ negativeOffset := negativeOffsets[curr]
+ // Skip the document if it is marked as a duplicate (negative offset of -1).
+ if negativeOffset == -1 {
+ // Skip this value, as it is itself a duplicate.
+ continue
+ }
+ value := curr + s.patterns[i].prevSeg.meta.Offset - postings.ID(negativeOffset)
+ if err := w.postings.Insert(value); err != nil {
+ iter.Close()
+ return fmt.Errorf("could not insert from cached postings: %w", err)
+ }
+ }
+
+ err := iter.Err()
+ iter.Close()
+ if err != nil {
+ return fmt.Errorf("could not iterate cached postings: %w", err)
+ }
 }

- // Rotate out the replaced frozen segments and add the compacted one.
- m.Lock()
- defer m.Unlock()
+ // Encode the result and create a read only copy as we reuse the postings
+ // list in the worker to build subsequent postings.
+ bytes, err := w.encoder.Encode(w.postings) + if err != nil { + return fmt.Errorf("could not encode result cached search postings: %w", err) + } - result := m.addCompactedSegmentFromSegmentsWithLock(m.backgroundSegments, - segments, replaceSeg) - m.backgroundSegments = result + // Need to copy bytes since encoder owns the bytes. + copied := append(make([]byte, 0, len(bytes)), bytes...) + readOnlyPostings, err := roaring.NewReadOnlyBitmap(copied) + if err != nil { + return fmt.Errorf("could not create result cached search postings: %w", err) + } - return nil + compactedSeg.PutCachedSearchPattern(s.field, search, readOnlyPostings) + return nil + })) + } + + return group.Wait() } func (m *mutableSegments) addCompactedSegmentFromSegmentsWithLock( diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index 2098d32899..762d4357e9 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -26,6 +26,7 @@ import ( "sync" "time" + "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/search" @@ -97,14 +98,19 @@ type PostingsListCache struct { type postingsListCacheRegistry struct { sync.RWMutex eventCh chan postingsListEvent - active map[uuid.Array]map[registryKey]postings.List + active map[uuid.Array]map[registryKey]registryValue } type registryKey struct { - field string - pattern string - patternType PatternType - searchQuery search.Query + field string + pattern string + patternType PatternType + searchQueryKey string +} + +type registryValue struct { + searchQuery *querypb.Query + postings postings.List } type postingsListEventType int @@ -134,7 +140,7 @@ func NewPostingsListCache( opts: opts, registry: postingsListCacheRegistry{ eventCh: make(chan postingsListEvent, 4096), - active: make(map[uuid.Array]map[registryKey]postings.List), + active: make(map[uuid.Array]map[registryKey]registryValue), }, metrics: newPostingsListCacheMetrics(opts.InstrumentOptions.MetricsScope()), logger: opts.InstrumentOptions.Logger(), @@ -233,11 +239,13 @@ type cachedPostings struct { field string pattern string patternType PatternType - // searchQuery is only set for search queries. - searchQuery search.Query // value postings postings.List + // searchQueryKey is only set for search queries. + searchQueryKey string + // searchQuery is only set for search queries. + searchQuery *querypb.Query } func keyHash( @@ -291,10 +299,10 @@ func (q *PostingsListCache) PutSearch( query search.Query, pl postings.List, ) { - if roaring.IsReadOnlyPostingsList(pl) { + if roaring.IsComplexReadOnlyPostingsList(pl) { // Copy into mutable postings list since it's expensive to read from - // a read only postings list over and over again (it's lazily - // evaluated from for allocation purposes). + // a complex read only postings list over and over again (it's lazily + // evaluated over many individual bitmaps for allocation purposes). 
mutable := q.opts.PostingsListPool.Get()
 if err := mutable.AddIterator(pl.Iterator()); err != nil {
 q.metrics.pooledGetErrAddIter.Inc(1)
@@ -321,7 +329,7 @@ func (q *PostingsListCache) put(
 field: field,
 pattern: pattern,
 patternType: patternType,
- searchQuery: searchQuery,
+ searchQuery: searchQueryProto(searchQuery),
 postings: pl,
 }
 q.lru.Set(key, value, 1)
@@ -382,11 +390,12 @@ func (q *PostingsListCache) startLoop() Closer {
 }

+// searchQueryProto returns the proto message for a search query, or nil if
+// no search query was provided (field, term and regexp puts pass nil).
+func searchQueryProto(q search.Query) *querypb.Query {
+ if q == nil {
+ return nil
+ }
+ return q.ToProto()
+}
+
 type CachedPattern struct {
- Field string
- Pattern string
- PatternType PatternType
- SearchQuery search.Query
- Postings postings.List
+ Field string
+ Pattern string
+ PatternType PatternType
+ SearchQueryKey string
+ SearchQuery *querypb.Query
+ Postings postings.List
 }

 type CachedPatternsResult struct {
@@ -395,10 +404,13 @@ type CachedPatternsResult struct {
 MatchedPatterns int
 }

+type CachedPatternForEachFn func(CachedPattern)
+
 func (q *PostingsListCache) CachedPatterns(
 uuid uuid.UUID,
 patternType PatternType,
-) ([]CachedPattern, CachedPatternsResult) {
+ fn CachedPatternForEachFn,
+) CachedPatternsResult {
 var result CachedPatternsResult

 q.registry.RLock()
@@ -406,7 +418,7 @@ func (q *PostingsListCache) CachedPatterns(

 segmentPostings, ok := q.registry.active[uuid.Array()]
 if !ok {
- return nil, result
+ return result
 }

 result.InRegistry = true
@@ -416,37 +428,40 @@ func (q *PostingsListCache) CachedPatterns(
 }
 }
 if result.TotalPatterns == 0 {
- return nil, CachedPatternsResult{}
+ return CachedPatternsResult{}
 }

- patterns := make([]CachedPattern, 0, result.TotalPatterns)
 for key, value := range segmentPostings {
 if patternType == key.patternType {
- patterns = append(patterns, CachedPattern{
- Field: key.field,
- Pattern: key.pattern,
- PatternType: key.patternType,
- SearchQuery: key.searchQuery,
- Postings: value,
+ fn(CachedPattern{
+ Field: key.field,
+ Pattern: key.pattern,
+ PatternType: key.patternType,
+ SearchQueryKey: key.searchQueryKey,
+ SearchQuery: value.searchQuery,
+ Postings: value.postings,
 })
 result.MatchedPatterns++
 }
 }

- return patterns, result
+ return result
 }

 func (q *PostingsListCache) processEventWithLock(ev postingsListEvent) {
 uuid := ev.cachedPostings.segmentUUID.Array()
 key := registryKey{
- field: ev.cachedPostings.field,
- pattern: ev.cachedPostings.pattern,
- patternType: ev.cachedPostings.patternType,
+ field: ev.cachedPostings.field,
+ pattern: ev.cachedPostings.pattern,
+ patternType: ev.cachedPostings.patternType,
+ searchQueryKey: ev.cachedPostings.searchQueryKey,
+ }
+ value := registryValue{
+ searchQuery: ev.cachedPostings.searchQuery,
+ postings: ev.cachedPostings.postings,
 }
 segmentPostings, ok := q.registry.active[uuid]
 if !ok {
- segmentPostings = make(map[registryKey]postings.List)
+ segmentPostings = make(map[registryKey]registryValue)
 q.registry.active[uuid] = segmentPostings
 }

@@ -457,7 +472,7 @@ func (q *PostingsListCache) processEventWithLock(ev postingsListEvent) {
 delete(segmentPostings, key)
 if len(segmentPostings) == 0 {
 delete(q.registry.active, uuid)
 }
 case addEventType:
- segmentPostings[key] = ev.cachedPostings.postings
+ segmentPostings[key] = value
 }
 }

diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go
index d858f9b061..93e2e173ec 100644
--- a/src/dbnode/storage/index/read_through_segment.go
+++ b/src/dbnode/storage/index/read_through_segment.go
@@ -179,16 +179,18 @@ type CachedSearchPatternsResult struct {
 CachedPatternsResult CachedPatternsResult
 }

-func (r *ReadThroughSegment) CachedSearchPatterns() ([]CachedPattern, CachedSearchPatternsResult) {
+func (r *ReadThroughSegment) CachedSearchPatterns(
+ fn CachedPatternForEachFn,
+)
CachedSearchPatternsResult {
 	cache := r.caches.SearchPostingsListCache
 	if cache == nil || !r.opts.CacheSearches {
-		return nil, CachedSearchPatternsResult{
+		return CachedSearchPatternsResult{
 			CacheSearchesDisabled: true,
 		}
 	}
 
-	patterns, result := cache.CachedPatterns(r.uuid, PatternTypeSearch)
-	return patterns, CachedSearchPatternsResult{
+	result := cache.CachedPatterns(r.uuid, PatternTypeSearch, fn)
+	return CachedSearchPatternsResult{
 		CachedPatternsResult: result,
 	}
 }
diff --git a/src/m3ninx/index/segment/builder/multi_segments_builder.go b/src/m3ninx/index/segment/builder/multi_segments_builder.go
index 262ffdd01a..536d8ced13 100644
--- a/src/m3ninx/index/segment/builder/multi_segments_builder.go
+++ b/src/m3ninx/index/segment/builder/multi_segments_builder.go
@@ -21,6 +21,7 @@
 package builder
 
 import (
+	"fmt"
 	"io"
 	"sort"
 
@@ -188,6 +189,25 @@ func (b *builderFromSegments) AddSegments(segments []segment.Segment) error {
 	return nil
 }
 
+func (b *builderFromSegments) SegmentMetadatas() ([]segment.SegmentsBuilderSegmentMetadata, error) {
+	n := len(b.segments)
+	if n < 1 {
+		return nil, fmt.Errorf("segments empty: length=%d", n)
+	}
+
+	result := make([]segment.SegmentsBuilderSegmentMetadata, 0, n)
+	for _, s := range b.segments {
+		result = append(result, segment.SegmentsBuilderSegmentMetadata{
+			Segment:         s.segment,
+			Offset:          s.offset,
+			NegativeOffsets: s.negativeOffsets,
+			Skips:           s.skips,
+		})
+	}
+
+	return result, nil
+}
+
 func (b *builderFromSegments) Docs() []doc.Metadata {
 	return b.docs
 }
diff --git a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go
index 86b1b03046..564bf4d186 100644
--- a/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go
+++ b/src/m3ninx/index/segment/builder/multi_segments_terms_iter.go
@@ -153,7 +153,7 @@ func (i *termsIterFromSegments) Next() bool {
 			continue
 		}
 
-		// We have to taken into account the offset and duplicates
+		// We have to take into account offset and duplicates/skips.
 		var (
 			iter            = list.Iterator()
 			negativeOffsets = termsKeyIter.segment.negativeOffsets
diff --git a/src/m3ninx/index/segment/types.go b/src/m3ninx/index/segment/types.go
index 8e25bd536c..75fcb08309 100644
--- a/src/m3ninx/index/segment/types.go
+++ b/src/m3ninx/index/segment/types.go
@@ -228,6 +228,25 @@ type SegmentsBuilder interface {
 
 	// AddSegments adds segments to build from.
 	AddSegments(segments []Segment) error
+
+	// SegmentMetadatas returns the segment builder segment metadata.
+	SegmentMetadatas() ([]SegmentsBuilderSegmentMetadata, error)
+}
+
+// SegmentsBuilderSegmentMetadata is a set of metadata about a segment
+// that was used to build a compacted segment.
+type SegmentsBuilderSegmentMetadata struct {
+	Segment Segment
+	Offset  postings.ID
+	// NegativeOffsets is a lookup of which document IDs are duplicates or
+	// should be skipped, that is documents already contained by other
+	// segments or that should not be included in the output segment and
+	// hence should not be returned when looking up documents. In that case
+	// the entry is -1. Otherwise the entry is the shift that should be
+	// applied when translating this postings ID to the result postings ID.
+	NegativeOffsets []int64
+	Skips           int64
 }
 
 // DocumentsFilter is a documents filter.
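The offset bookkeeping above is easiest to see as a tiny translation function. A minimal sketch, not part of the patch and with the helper name translatePostingsID assumed purely for illustration, of how a consumer of SegmentsBuilderSegmentMetadata could map a source segment's postings ID into the compacted segment's ID space:

    package example

    import (
        "github.com/m3db/m3/src/m3ninx/index/segment"
        "github.com/m3db/m3/src/m3ninx/postings"
    )

    // translatePostingsID maps a postings ID from a source segment into the
    // compacted segment's ID space: an entry of -1 in NegativeOffsets marks a
    // duplicate or skipped document, otherwise the entry is the accumulated
    // shift to subtract before adding the segment's base offset.
    // Assumes id is within the source segment's postings ID range.
    func translatePostingsID(
        meta segment.SegmentsBuilderSegmentMetadata,
        id postings.ID,
    ) (postings.ID, bool) {
        neg := meta.NegativeOffsets[id]
        if neg == -1 {
            // Already contained by another segment or filtered out.
            return 0, false
        }
        return id + meta.Offset - postings.ID(neg), true
    }

This mirrors the per-document adjustment the terms iterator above applies when merging postings lists across segments.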
diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index 2f6103f76e..d2cd79318c 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -85,6 +85,13 @@ func IsReadOnlyPostingsList(pl postings.List) bool { return ok } +// IsComplexReadOnlyPostingsList returns whether a postings list is a complex +// read only bitmap derived from other bitmaps or not. +func IsComplexReadOnlyPostingsList(pl postings.List) bool { + _, ok := pl.(*multiBitmap) + return ok +} + var _ postings.List = (*multiBitmap)(nil) var _ readOnlyIterable = (*multiBitmap)(nil) diff --git a/src/m3ninx/search/query/codec.go b/src/m3ninx/search/query/codec.go index d7fc0a9a3f..806f39f91c 100644 --- a/src/m3ninx/search/query/codec.go +++ b/src/m3ninx/search/query/codec.go @@ -45,10 +45,11 @@ func Unmarshal(data []byte) (search.Query, error) { return nil, err } - return unmarshal(&pb) + return UnmarshalProto(&pb) } -func unmarshal(q *querypb.Query) (search.Query, error) { +// UnmarshalProto will unmarshal a proto query. +func UnmarshalProto(q *querypb.Query) (search.Query, error) { switch q := q.Query.(type) { case *querypb.Query_All: @@ -64,7 +65,7 @@ func unmarshal(q *querypb.Query) (search.Query, error) { return NewRegexpQuery(q.Regexp.Field, q.Regexp.Regexp) case *querypb.Query_Negation: - inner, err := unmarshal(q.Negation.Query) + inner, err := UnmarshalProto(q.Negation.Query) if err != nil { return nil, err } @@ -73,7 +74,7 @@ func unmarshal(q *querypb.Query) (search.Query, error) { case *querypb.Query_Conjunction: qs := make([]search.Query, 0, len(q.Conjunction.Queries)) for _, qry := range q.Conjunction.Queries { - sqry, err := unmarshal(qry) + sqry, err := UnmarshalProto(qry) if err != nil { return nil, err } @@ -84,7 +85,7 @@ func unmarshal(q *querypb.Query) (search.Query, error) { case *querypb.Query_Disjunction: qs := make([]search.Query, 0, len(q.Disjunction.Queries)) for _, qry := range q.Disjunction.Queries { - sqry, err := unmarshal(qry) + sqry, err := UnmarshalProto(qry) if err != nil { return nil, err } diff --git a/src/x/sync/types.go b/src/x/sync/types.go index 750ce94f85..a7c1809fd4 100644 --- a/src/x/sync/types.go +++ b/src/x/sync/types.go @@ -91,6 +91,9 @@ type WorkerPool interface { // PutToken returns a token reserved by GetToken. PutToken() + + // Size returns the size of the worker pool. + Size() int } // PooledWorkerPoolOptions is the options for a PooledWorkerPool. diff --git a/src/x/sync/worker_pool.go b/src/x/sync/worker_pool.go index 11f80c4b76..58eb7df0f4 100644 --- a/src/x/sync/worker_pool.go +++ b/src/x/sync/worker_pool.go @@ -69,6 +69,10 @@ func (p *workerPool) PutToken() { p.workCh <- struct{}{} } +func (p *workerPool) Size() int { + return cap(p.workCh) +} + func (p *workerPool) GoWithTimeout(work Work, timeout time.Duration) bool { // Attempt to try writing without allocating a ticker. 
	select {

From e151cbcc8c2aaac583741dc52d24ec74bcc2716a Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Sat, 10 Apr 2021 09:25:14 -0400
Subject: [PATCH 097/106] Fix search query proto

---
 src/dbnode/storage/index/postings_list_cache.go | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go
index 762d4357e9..9a1b385897 100644
--- a/src/dbnode/storage/index/postings_list_cache.go
+++ b/src/dbnode/storage/index/postings_list_cache.go
@@ -323,13 +323,18 @@ func (q *PostingsListCache) put(
 	searchQuery search.Query,
 	pl postings.List,
 ) {
+	var searchQueryProto *querypb.Query
+	if searchQuery != nil {
+		searchQueryProto = searchQuery.ToProto()
+	}
+
 	key := keyHash(segmentUUID, field, pattern, patternType)
 	value := &cachedPostings{
 		segmentUUID: segmentUUID,
 		field:       field,
 		pattern:     pattern,
 		patternType: patternType,
-		searchQuery: searchQuery.ToProto(),
+		searchQuery: searchQueryProto,
 		postings:    pl,
 	}
 	q.lru.Set(key, value, 1)

From b88605242ee4c3b68844e56690f41b3aa38d9910 Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Sat, 10 Apr 2021 09:37:12 -0400
Subject: [PATCH 098/106] Fix missing postings for registry value

---
 src/dbnode/storage/index/postings_list_cache.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go
index 9a1b385897..f2a35701b6 100644
--- a/src/dbnode/storage/index/postings_list_cache.go
+++ b/src/dbnode/storage/index/postings_list_cache.go
@@ -463,6 +463,7 @@ func (q *PostingsListCache) processEventWithLock(ev postingsListEvent) {
 	}
 	value := registryValue{
 		searchQuery: ev.cachedPostings.searchQuery,
+		postings:    ev.cachedPostings.postings,
 	}
 	segmentPostings, ok := q.registry.active[uuid]
 	if !ok {

From a8677225bc0a30a594bb0a07287a5fab1b580edc Mon Sep 17 00:00:00 2001
From: Rob Skillington
Date: Mon, 12 Apr 2021 11:51:30 -0400
Subject: [PATCH 099/106] Add a postings cache TTL of 15min, sort conjunction
 queries for a deterministic query key, and return the optimized form

---
 src/dbnode/storage/index/mutable_segments.go  | 45 +++++----
 .../storage/index/postings_list_cache.go      | 98 +++++++++++++------
 .../storage/index/postings_list_cache_test.go | 76 +++++++-------
 .../storage/index/read_through_segment.go     | 13 ++-
 src/m3ninx/search/query/conjunction.go        |  8 ++
 src/m3ninx/search/query/disjunction.go        |  4 +
 6 files changed, 160 insertions(+), 84 deletions(-)

diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go
index 79fa11a566..b32a7d5c0c 100644
--- a/src/dbnode/storage/index/mutable_segments.go
+++ b/src/dbnode/storage/index/mutable_segments.go
@@ -126,23 +126,25 @@ func (f *indexedBloomFilter) Write(id []byte) {
 }
 
 type mutableSegmentsMetrics struct {
-	foregroundCompactionPlanRunLatency tally.Timer
-	foregroundCompactionTaskRunLatency tally.Timer
-	backgroundCompactionPlanRunLatency tally.Timer
-	backgroundCompactionTaskRunLatency tally.Timer
-	activeBlockIndexNew tally.Counter
-	activeBlockGarbageCollectSegment tally.Counter
-	activeBlockGarbageCollectSeries tally.Counter
-	activeBlockGarbageCollectEmptySegment tally.Counter
-	activeBlockGarbageCollectCachedSearchesDisabled tally.Counter
-	activeBlockGarbageCollectCachedSearchesInRegistry tally.Counter
-	activeBlockGarbageCollectCachedSearchesNotInRegistry tally.Counter
-	activeBlockGarbageCollectCachedSearchesTotal tally.Histogram
-	
activeBlockGarbageCollectCachedSearchesMatched tally.Histogram - activeBlockGarbageCollectReconstructCachedSearchSuccess tally.Counter - activeBlockGarbageCollectReconstructCachedSearchError tally.Counter - activeBlockGarbageCollectReconstructCachedSearchCacheHit tally.Counter - activeBlockGarbageCollectReconstructCachedSearchCacheMiss tally.Counter + foregroundCompactionPlanRunLatency tally.Timer + foregroundCompactionTaskRunLatency tally.Timer + backgroundCompactionPlanRunLatency tally.Timer + backgroundCompactionTaskRunLatency tally.Timer + activeBlockIndexNew tally.Counter + activeBlockGarbageCollectSegment tally.Counter + activeBlockGarbageCollectSeries tally.Counter + activeBlockGarbageCollectEmptySegment tally.Counter + activeBlockGarbageCollectCachedSearchesDisabled tally.Counter + activeBlockGarbageCollectCachedSearchesInRegistry tally.Counter + activeBlockGarbageCollectCachedSearchesNotInRegistry tally.Counter + activeBlockGarbageCollectCachedSearchesTotal tally.Histogram + activeBlockGarbageCollectCachedSearchesMatched tally.Histogram + activeBlockGarbageCollectReconstructCachedSearchEvalSkip tally.Counter + activeBlockGarbageCollectReconstructCachedSearchEvalAttempt tally.Counter + activeBlockGarbageCollectReconstructCachedSearchSuccess tally.Counter + activeBlockGarbageCollectReconstructCachedSearchError tally.Counter + activeBlockGarbageCollectReconstructCachedSearchCacheHit tally.Counter + activeBlockGarbageCollectReconstructCachedSearchCacheMiss tally.Counter } func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { @@ -171,6 +173,12 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)), activeBlockGarbageCollectCachedSearchesMatched: backgroundScope.Histogram("gc-cached-searches-matched", append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)), + activeBlockGarbageCollectReconstructCachedSearchEvalSkip: backgroundScope.Tagged(map[string]string{ + "eval_type": "skip", + }).Counter("gc-reconstruct-cached-search"), + activeBlockGarbageCollectReconstructCachedSearchEvalAttempt: backgroundScope.Tagged(map[string]string{ + "eval_type": "attempt", + }).Counter("gc-reconstruct-cached-search"), activeBlockGarbageCollectReconstructCachedSearchSuccess: backgroundScope.Tagged(map[string]string{ "result_type": "success", }).Counter("gc-reconstruct-cached-search"), @@ -973,9 +981,12 @@ func (m *mutableSegments) populateCachedSearches( // expensive computation and we're not getting the benefit from // running the computation anyway since these aren't searches // that were cached in the largest segments we just compacted. + m.metrics.activeBlockGarbageCollectReconstructCachedSearchEvalSkip.Inc(1) continue } + m.metrics.activeBlockGarbageCollectReconstructCachedSearchEvalAttempt.Inc(1) + // Control concurrency by taking and returning token from worker pool. w := <-workers group.Go(instrumentedExec(func() error { diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index f2a35701b6..1fa2623148 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -63,6 +63,7 @@ const ( reportLoopInterval = 10 * time.Second emptyPattern = "" + lruTTL = 15 * time.Minute ) // PostingsListCacheOptions is the options struct for the query cache. 
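As an aside on the concurrency control in populateCachedSearches above: taking a token from the worker pool before spawning each reconstruction, and returning it when the goroutine finishes, bounds how many cached-search reconstructions run at once. A standalone sketch of that token pattern, with all names hypothetical rather than taken from the patch:

    package example

    import "sync"

    // runBounded executes tasks with at most `concurrency` goroutines in
    // flight: a pre-filled buffered channel acts as the token pool, and each
    // task holds one token for the lifetime of its goroutine.
    func runBounded(tasks []func(), concurrency int) {
        tokens := make(chan struct{}, concurrency)
        for i := 0; i < concurrency; i++ {
            tokens <- struct{}{} // fill the pool
        }

        var wg sync.WaitGroup
        for _, task := range tasks {
            task := task
            <-tokens // take a token, blocking while the pool is exhausted
            wg.Add(1)
            go func() {
                defer func() {
                    tokens <- struct{}{} // return the token
                    wg.Done()
                }()
                task()
            }()
        }
        wg.Wait()
    }

Receiving the token before the go statement means the loop itself throttles, so at most cap(tokens) reconstructions are ever in flight.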
@@ -292,13 +293,21 @@ func (q *PostingsListCache) PutField( q.put(segmentUUID, field, emptyPattern, PatternTypeField, nil, pl) } +// PutSearchResult describes the put search operation result and if +// the postings were copied into an optimized form for faster iteration. +type PutSearchResult struct { + Optimized bool + OptimizedPostings postings.List +} + // PutSearch updates the LRU with the result of a search query. func (q *PostingsListCache) PutSearch( segmentUUID uuid.UUID, queryStr string, query search.Query, pl postings.List, -) { +) PutSearchResult { + var result PutSearchResult if roaring.IsComplexReadOnlyPostingsList(pl) { // Copy into mutable postings list since it's expensive to read from // a complex read only postings list over and over again (it's lazily @@ -309,10 +318,14 @@ func (q *PostingsListCache) PutSearch( q.logger.Error("unable to add postings iter", zap.Error(err)) return } + result.Optimized = true + result.OptimizedPostings = mutable pl = mutable } q.put(segmentUUID, queryStr, emptyPattern, PatternTypeSearch, query, pl) + + return result } func (q *PostingsListCache) put( @@ -337,7 +350,7 @@ func (q *PostingsListCache) put( searchQuery: searchQueryProto, postings: pl, } - q.lru.Set(key, value, 1) + q.lru.SetWithTTL(key, value, 1, lruTTL) q.emitCachePutMetrics(patternType) q.registry.eventCh <- postingsListEvent{ eventType: addEventType, @@ -411,9 +424,13 @@ type CachedPatternsResult struct { type CachedPatternForEachFn func(CachedPattern) +type CachedPatternsQuery struct { + PatternType *PatternType +} + func (q *PostingsListCache) CachedPatterns( uuid uuid.UUID, - patternType PatternType, + query CachedPatternsQuery, fn CachedPatternForEachFn, ) CachedPatternsResult { var result CachedPatternsResult @@ -427,27 +444,21 @@ func (q *PostingsListCache) CachedPatterns( } result.InRegistry = true - for key := range segmentPostings { - if patternType == key.patternType { - result.TotalPatterns++ - } - } - if result.TotalPatterns == 0 { - return CachedPatternsResult{} - } - + result.TotalPatterns = len(segmentPostings) for key, value := range segmentPostings { - if patternType == key.patternType { - fn(CachedPattern{ - Field: key.field, - Pattern: key.pattern, - PatternType: key.patternType, - SearchQueryKey: key.searchQueryKey, - SearchQuery: value.searchQuery, - Postings: value.postings, - }) - result.MatchedPatterns++ + if v := query.PatternType; v != nil && *v != key.patternType { + continue } + + fn(CachedPattern{ + Field: key.field, + Pattern: key.pattern, + PatternType: key.patternType, + SearchQueryKey: key.searchQueryKey, + SearchQuery: value.searchQuery, + Postings: value.postings, + }) + result.MatchedPatterns++ } return result @@ -480,6 +491,8 @@ func (q *PostingsListCache) processEventWithLock(ev postingsListEvent) { case addEventType: segmentPostings[key] = value } + + q.emitRegistryMetrics(ev.cachedPostings.patternType, ev.eventType) } // Report will emit metrics about the status of the cache. 
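With CachedPatterns now driven by a per-entry callback rather than returning a slice, callers that only inspect or count matching patterns avoid allocating an intermediate []CachedPattern. A hedged usage sketch, where collectSearchPatterns is a hypothetical caller and only the cache API introduced in this patch is assumed:

    package example

    import (
        "github.com/m3db/m3/src/dbnode/storage/index"
        "github.com/pborman/uuid"
    )

    // collectSearchPatterns gathers the cached search patterns for one
    // segment; a caller that only counted matches could drop the slice
    // and just increment a counter inside the callback.
    func collectSearchPatterns(
        cache *index.PostingsListCache,
        segmentUUID uuid.UUID,
    ) []index.CachedPattern {
        var patterns []index.CachedPattern
        patternType := index.PatternTypeSearch
        cache.CachedPatterns(segmentUUID, index.CachedPatternsQuery{
            PatternType: &patternType, // nil would match every pattern type
        }, func(p index.CachedPattern) {
            patterns = append(patterns, p)
        })
        return patterns
    }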
@@ -523,6 +536,31 @@ func (q *PostingsListCache) emitCachePutMetrics(patternType PatternType) { } } +func (q *PostingsListCache) emitRegistryMetrics( + patternType PatternType, + eventType postingsListEventType, +) { + var method *postingsListCacheMethodMetrics + switch patternType { + case PatternTypeRegexp: + method = q.metrics.regexp + case PatternTypeTerm: + method = q.metrics.term + case PatternTypeField: + method = q.metrics.field + case PatternTypeSearch: + method = q.metrics.search + default: + method = q.metrics.unknown // should never happen + } + switch eventType { + case removeEventType: + method.registryRemoves.Inc(1) + case addEventType: + method.registryAdds.Inc(1) + } +} + type postingsListCacheMetrics struct { regexp *postingsListCacheMethodMetrics term *postingsListCacheMethodMetrics @@ -570,15 +608,19 @@ func newPostingsListCacheMetrics(scope tally.Scope) *postingsListCacheMetrics { } type postingsListCacheMethodMetrics struct { - hits tally.Counter - misses tally.Counter - puts tally.Counter + hits tally.Counter + misses tally.Counter + puts tally.Counter + registryAdds tally.Counter + registryRemoves tally.Counter } func newPostingsListCacheMethodMetrics(scope tally.Scope) *postingsListCacheMethodMetrics { return &postingsListCacheMethodMetrics{ - hits: scope.Counter("hits"), - misses: scope.Counter("misses"), - puts: scope.Counter("puts"), + hits: scope.Counter("hits"), + misses: scope.Counter("misses"), + puts: scope.Counter("puts"), + registryAdds: scope.Counter("registry_adds"), + registryRemoves: scope.Counter("registry_removes"), } } diff --git a/src/dbnode/storage/index/postings_list_cache_test.go b/src/dbnode/storage/index/postings_list_cache_test.go index e4edd01a3f..e69fa29842 100644 --- a/src/dbnode/storage/index/postings_list_cache_test.go +++ b/src/dbnode/storage/index/postings_list_cache_test.go @@ -71,7 +71,7 @@ func init() { testPlEntries = append(testPlEntries, testEntry{ segmentUUID: segmentUUID, - key: newKey(field, pattern, patternType), + key: keyHash(segmentUUID, field, pattern, patternType), postingsList: pl, }) } @@ -79,7 +79,7 @@ func init() { type testEntry struct { segmentUUID uuid.UUID - key key + key uint64 postingsList postings.List } @@ -273,70 +273,70 @@ func testConcurrency(t *testing.T, size int, purge bool, verify bool) { } } -func putEntry(t *testing.T, cache *PostingsListCache, i int) { +func putEntry(t *testing.T, cache *PostingsListCache, entry testEntry) { // Do each put twice to test the logic that avoids storing // multiple entries for the same value. 
switch testPlEntries[i].key.patternType { case PatternTypeRegexp: cache.PutRegexp( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, - testPlEntries[i].postingsList, + entry.segmentUUID, + entry.key.field, + entry.key.pattern, + entry.postingsList, ) cache.PutRegexp( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, - testPlEntries[i].postingsList, + entry.segmentUUID, + entry.key.field, + entry.key.pattern, + entry.postingsList, ) case PatternTypeTerm: cache.PutTerm( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, - testPlEntries[i].postingsList, + entry.segmentUUID, + entry.key.field, + entry.key.pattern, + entry.postingsList, ) cache.PutTerm( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, - testPlEntries[i].postingsList, + entry.segmentUUID, + entry.key.field, + entry.key.pattern, + entry.postingsList, ) case PatternTypeField: cache.PutField( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].postingsList, + entry.segmentUUID, + entry.key.field, + entry.postingsList, ) cache.PutField( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].postingsList, + entry.segmentUUID, + entry.key.field, + entry.postingsList, ) default: require.FailNow(t, "unknown pattern type", testPlEntries[i].key.patternType) } } -func getEntry(t *testing.T, cache *PostingsListCache, i int) (postings.List, bool) { +func getEntry(t *testing.T, cache *PostingsListCache, entry testEntry) (postings.List, bool) { switch testPlEntries[i].key.patternType { case PatternTypeRegexp: return cache.GetRegexp( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, + entry.segmentUUID, + entry.key.field, + entry.key.pattern, ) case PatternTypeTerm: return cache.GetTerm( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, + entry.segmentUUID, + entry.key.field, + entry.key.pattern, ) case PatternTypeField: return cache.GetField( - testPlEntries[i].segmentUUID, - testPlEntries[i].key.field, + entry.segmentUUID, + entry.key.field, ) default: require.FailNow(t, "unknown pattern type", testPlEntries[i].key.patternType) @@ -344,9 +344,11 @@ func getEntry(t *testing.T, cache *PostingsListCache, i int) (postings.List, boo return nil, false } -func requireExpectedOrder(t *testing.T, plCache *PostingsListCache, expectedOrder []testEntry) { - for i, key := range plCache.lru.keys() { - require.Equal(t, expectedOrder[i].key, key) +func requireContains(t *testing.T, plCache *PostingsListCache, values []testEntry) { + // Wait for registry to catchup. 
+	for _, value := range values {
+		_, ok := getEntry(t, plCache, value)
+		require.True(t, ok)
 	}
 }
 
diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go
index 93e2e173ec..395617e97d 100644
--- a/src/dbnode/storage/index/read_through_segment.go
+++ b/src/dbnode/storage/index/read_through_segment.go
@@ -189,7 +189,10 @@ func (r *ReadThroughSegment) CachedSearchPatterns(
 		}
 	}
 
-	result := cache.CachedPatterns(r.uuid, PatternTypeSearch, fn)
+	patternType := PatternTypeSearch
+	result := cache.CachedPatterns(r.uuid, CachedPatternsQuery{
+		PatternType: &patternType,
+	}, fn)
 	return CachedSearchPatternsResult{
 		CachedPatternsResult: result,
 	}
@@ -381,7 +384,13 @@ func (s *readThroughSegmentReader) Search(
 		return nil, err
 	}
 
-	cache.PutSearch(s.uuid, queryStr, query, pl)
+	result := cache.PutSearch(s.uuid, queryStr, query, pl)
+	if result.Optimized {
+		// If the result was optimized for faster iteration speed when
+		// retrieved from cache, then use that for the response for this
+		// query too.
+		pl = result.OptimizedPostings
+	}
 
 	return pl, nil
 }
diff --git a/src/m3ninx/search/query/conjunction.go b/src/m3ninx/search/query/conjunction.go
index 33c896dd26..33b695f6a3 100644
--- a/src/m3ninx/search/query/conjunction.go
+++ b/src/m3ninx/search/query/conjunction.go
@@ -60,6 +60,14 @@ func NewConjunctionQuery(queries []search.Query) search.Query {
 		ns = ns[1:]
 	}
 
+	// Sort the queries/negations for a deterministic cache key.
+	sort.Slice(qs, func(i, j int) bool {
+		return qs[i].String() < qs[j].String()
+	})
+	sort.Slice(ns, func(i, j int) bool {
+		return ns[i].String() < ns[j].String()
+	})
+
 	q := &ConjuctionQuery{
 		queries:   qs,
 		negations: ns,
diff --git a/src/m3ninx/search/query/disjunction.go b/src/m3ninx/search/query/disjunction.go
index c0c6acd452..caedead376 100644
--- a/src/m3ninx/search/query/disjunction.go
+++ b/src/m3ninx/search/query/disjunction.go
@@ -48,6 +48,10 @@ func NewDisjunctionQuery(queries []search.Query) search.Query {
 		qs = append(qs, query)
 	}
 
+	// Sort the queries for a deterministic cache key.
+ sort.Slice(qs, func(i, j int) bool { + return qs[i].String() < qs[j].String() + }) q := &DisjuctionQuery{ queries: qs, } From b8f4d8283165cee2fcc28e376039fd0f72be2c0a Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Mon, 12 Apr 2021 11:57:33 -0400 Subject: [PATCH 100/106] Fix build --- src/dbnode/storage/index/postings_list_cache.go | 7 +++++-- src/m3ninx/search/query/conjunction.go | 1 + src/m3ninx/search/query/disjunction.go | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index 1fa2623148..6d032773ec 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -315,8 +315,11 @@ func (q *PostingsListCache) PutSearch( mutable := q.opts.PostingsListPool.Get() if err := mutable.AddIterator(pl.Iterator()); err != nil { q.metrics.pooledGetErrAddIter.Inc(1) - q.logger.Error("unable to add postings iter", zap.Error(err)) - return + iopts := q.opts.InstrumentOptions + instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) { + l.Error("unable to add postings iter", zap.Error(err)) + }) + return result } result.Optimized = true result.OptimizedPostings = mutable diff --git a/src/m3ninx/search/query/conjunction.go b/src/m3ninx/search/query/conjunction.go index 33b695f6a3..12b429973a 100644 --- a/src/m3ninx/search/query/conjunction.go +++ b/src/m3ninx/search/query/conjunction.go @@ -21,6 +21,7 @@ package query import ( + "sort" "strings" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" diff --git a/src/m3ninx/search/query/disjunction.go b/src/m3ninx/search/query/disjunction.go index caedead376..51dc83c6dc 100644 --- a/src/m3ninx/search/query/disjunction.go +++ b/src/m3ninx/search/query/disjunction.go @@ -21,6 +21,7 @@ package query import ( + "sort" "strings" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" From 34a8594075d0ba5a81456070d480e17432b50e26 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 13 Apr 2021 08:03:48 -0400 Subject: [PATCH 101/106] Restore simple LRU cache for accuracy when compacting segments --- src/dbnode/storage/index/block_bench_test.go | 26 +- src/dbnode/storage/index/block_test.go | 38 +- .../index/filter_fields_iterator_test.go | 21 +- src/dbnode/storage/index/index_mock.go | 404 +++++++++++++++--- src/dbnode/storage/index/mutable_segments.go | 32 +- .../storage/index/mutable_segments_test.go | 75 ++++ .../storage/index/postings_list_cache.go | 215 ++-------- .../storage/index/postings_list_cache_lru.go | 235 ++++++++++ .../index/postings_list_cache_lru_test.go | 32 ++ .../storage/index/postings_list_cache_test.go | 76 ++-- .../storage/index/read_through_segment.go | 13 + .../index/read_through_segment_test.go | 88 ++-- src/dbnode/storage/storage_mock.go | 29 -- src/m3ninx/index/index_mock.go | 30 ++ src/m3ninx/index/segment/segment_mock.go | 95 ++++ src/m3ninx/search/search_mock.go | 38 ++ 16 files changed, 1073 insertions(+), 374 deletions(-) create mode 100644 src/dbnode/storage/index/mutable_segments_test.go create mode 100644 src/dbnode/storage/index/postings_list_cache_lru.go create mode 100644 src/dbnode/storage/index/postings_list_cache_lru_test.go diff --git a/src/dbnode/storage/index/block_bench_test.go b/src/dbnode/storage/index/block_bench_test.go index ec742a977a..94971b75f2 100644 --- a/src/dbnode/storage/index/block_bench_test.go +++ b/src/dbnode/storage/index/block_bench_test.go @@ -60,10 +60,10 @@ func BenchmarkBlockWrite(b *testing.B) { }) 
fieldValues := map[string][]string{ - "fruit": []string{"apple", "banana", "orange", "watermelon"}, - "vegetable": []string{"broccoli", "carrot", "celery", "cucumber"}, - "meat": []string{"beef", "chicken", "pork", "steak"}, - "cheese": []string{"cheddar", "swiss", "brie", "bleu"}, + "fruit": {"apple", "banana", "orange", "watermelon"}, + "vegetable": {"broccoli", "carrot", "celery", "cucumber"}, + "meat": {"beef", "chicken", "pork", "steak"}, + "cheese": {"cheddar", "swiss", "brie", "bleu"}, } for i := 0; i < 4096; i++ { @@ -123,3 +123,21 @@ func (m mockOnIndexSeries) OnIndexPrepare() {} func (m mockOnIndexSeries) NeedsIndexUpdate(indexBlockStartForWrite xtime.UnixNano) bool { return false } + +func (m mockOnIndexSeries) IfAlreadyIndexedMarkIndexSuccessAndFinalize( + blockStart xtime.UnixNano, +) bool { + return false +} + +func (m mockOnIndexSeries) RemoveIndexedForBlockStarts( + blockStarts map[xtime.UnixNano]struct{}, +) RemoveIndexedForBlockStartsResult { + return RemoveIndexedForBlockStartsResult{} +} + +func (m mockOnIndexSeries) RelookupAndIncrementReaderWriterCount() (OnIndexSeries, bool) { + return nil, false +} + +func (m mockOnIndexSeries) DecrementReaderWriterCount() {} diff --git a/src/dbnode/storage/index/block_test.go b/src/dbnode/storage/index/block_test.go index f33c6b71e9..7fd5eddab4 100644 --- a/src/dbnode/storage/index/block_test.go +++ b/src/dbnode/storage/index/block_test.go @@ -1774,13 +1774,13 @@ func TestBlockWriteBackgroundCompact(t *testing.T) { {Segment: b.mutableSegments.foregroundSegments[0].Segment()}, }) require.Equal(t, 2, len(b.mutableSegments.backgroundSegments)) - require.True(t, b.mutableSegments.compact.compactingBackground) + require.True(t, b.mutableSegments.compact.compactingBackgroundStandard) b.mutableSegments.Unlock() // Wait for compaction to finish for { b.mutableSegments.RLock() - compacting := b.mutableSegments.compact.compactingBackground + compacting := b.mutableSegments.compact.compactingBackgroundStandard b.mutableSegments.RUnlock() if !compacting { break @@ -2241,3 +2241,37 @@ func testDoc3() doc.Metadata { }, } } + +func testDocN(n int) doc.Metadata { + return doc.Metadata{ + ID: []byte(fmt.Sprintf("doc-%d", n)), + Fields: []doc.Field{ + { + Name: []byte("foo"), + Value: []byte("bar"), + }, + { + Name: []byte("bucket-0"), + Value: moduloByteStr([]string{ + "one", + "two", + "three", + }, n), + }, + { + Name: []byte("bucket-1"), + Value: moduloByteStr([]string{ + "one", + "two", + "three", + "four", + "five", + }, n), + }, + }, + } +} + +func moduloByteStr(strs []string, n int) []byte { + return []byte(strs[n%len(strs)]) +} diff --git a/src/dbnode/storage/index/filter_fields_iterator_test.go b/src/dbnode/storage/index/filter_fields_iterator_test.go index 9e584011a4..10c6efee59 100644 --- a/src/dbnode/storage/index/filter_fields_iterator_test.go +++ b/src/dbnode/storage/index/filter_fields_iterator_test.go @@ -24,6 +24,7 @@ import ( "testing" "github.com/m3db/m3/src/m3ninx/index/segment" + "github.com/m3db/m3/src/m3ninx/postings" xtest "github.com/m3db/m3/src/x/test" "github.com/golang/mock/gomock" @@ -69,12 +70,16 @@ func TestNewFilterFieldsIteratorFirstMatch(t *testing.T) { r.EXPECT().ContainsField([]byte("c")).Return(false, nil), ) require.True(t, iter.Next()) - require.Equal(t, "a", string(iter.Current())) + require.Equal(t, "a", iterCurrTerm(iter.Current())) require.False(t, iter.Next()) require.NoError(t, iter.Err()) require.NoError(t, iter.Close()) } +func iterCurrTerm(term []byte, _ postings.List) string { + return string(term) 
+} + func TestNewFilterFieldsIteratorMiddleMatch(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() @@ -90,7 +95,7 @@ func TestNewFilterFieldsIteratorMiddleMatch(t *testing.T) { r.EXPECT().ContainsField([]byte("c")).Return(false, nil), ) require.True(t, iter.Next()) - require.Equal(t, "b", string(iter.Current())) + require.Equal(t, "b", iterCurrTerm(iter.Current())) require.False(t, iter.Next()) require.NoError(t, iter.Err()) require.NoError(t, iter.Close()) @@ -111,7 +116,7 @@ func TestNewFilterFieldsIteratorEndMatch(t *testing.T) { r.EXPECT().ContainsField([]byte("c")).Return(true, nil), ) require.True(t, iter.Next()) - require.Equal(t, "c", string(iter.Current())) + require.Equal(t, "c", iterCurrTerm(iter.Current())) require.False(t, iter.Next()) require.NoError(t, iter.Err()) require.NoError(t, iter.Close()) @@ -132,11 +137,11 @@ func TestNewFilterFieldsIteratorAllMatch(t *testing.T) { r.EXPECT().ContainsField([]byte("c")).Return(true, nil), ) require.True(t, iter.Next()) - require.Equal(t, "a", string(iter.Current())) + require.Equal(t, "a", iterCurrTerm(iter.Current())) require.True(t, iter.Next()) - require.Equal(t, "b", string(iter.Current())) + require.Equal(t, "b", iterCurrTerm(iter.Current())) require.True(t, iter.Next()) - require.Equal(t, "c", string(iter.Current())) + require.Equal(t, "c", iterCurrTerm(iter.Current())) require.False(t, iter.Next()) require.NoError(t, iter.Err()) require.NoError(t, iter.Close()) @@ -157,9 +162,9 @@ func TestNewFilterFieldsIteratorRandomMatch(t *testing.T) { r.EXPECT().ContainsField([]byte("c")).Return(true, nil), ) require.True(t, iter.Next()) - require.Equal(t, "a", string(iter.Current())) + require.Equal(t, "a", iterCurrTerm(iter.Current())) require.True(t, iter.Next()) - require.Equal(t, "c", string(iter.Current())) + require.Equal(t, "c", iterCurrTerm(iter.Current())) require.False(t, iter.Next()) require.NoError(t, iter.Err()) require.NoError(t, iter.Close()) diff --git a/src/dbnode/storage/index/index_mock.go b/src/dbnode/storage/index/index_mock.go index 0c44cf9614..6ad6ab3076 100644 --- a/src/dbnode/storage/index/index_mock.go +++ b/src/dbnode/storage/index/index_mock.go @@ -43,6 +43,7 @@ import ( "github.com/m3db/m3/src/x/mmap" "github.com/m3db/m3/src/x/pool" "github.com/m3db/m3/src/x/resource" + "github.com/m3db/m3/src/x/sync" time0 "github.com/m3db/m3/src/x/time" "github.com/golang/mock/gomock" @@ -72,6 +73,36 @@ func (m *MockBaseResults) EXPECT() *MockBaseResultsMockRecorder { return m.recorder } +// EnforceLimits mocks base method +func (m *MockBaseResults) EnforceLimits() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EnforceLimits") + ret0, _ := ret[0].(bool) + return ret0 +} + +// EnforceLimits indicates an expected call of EnforceLimits +func (mr *MockBaseResultsMockRecorder) EnforceLimits() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockBaseResults)(nil).EnforceLimits)) +} + +// AddDocuments mocks base method +func (m *MockBaseResults) AddDocuments(batch []doc.Document) (int, int, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddDocuments", batch) + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(int) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// AddDocuments indicates an expected call of AddDocuments +func (mr *MockBaseResultsMockRecorder) AddDocuments(batch interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", 
reflect.TypeOf((*MockBaseResults)(nil).AddDocuments), batch) +} + // Namespace mocks base method func (m *MockBaseResults) Namespace() ident.ID { m.ctrl.T.Helper() @@ -114,8 +145,58 @@ func (mr *MockBaseResultsMockRecorder) TotalDocsCount() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TotalDocsCount", reflect.TypeOf((*MockBaseResults)(nil).TotalDocsCount)) } +// NonConcurrentBuilder mocks base method +func (m *MockBaseResults) NonConcurrentBuilder() (BaseResultsBuilder, bool) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NonConcurrentBuilder") + ret0, _ := ret[0].(BaseResultsBuilder) + ret1, _ := ret[1].(bool) + return ret0, ret1 +} + +// NonConcurrentBuilder indicates an expected call of NonConcurrentBuilder +func (mr *MockBaseResultsMockRecorder) NonConcurrentBuilder() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NonConcurrentBuilder", reflect.TypeOf((*MockBaseResults)(nil).NonConcurrentBuilder)) +} + +// Finalize mocks base method +func (m *MockBaseResults) Finalize() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Finalize") +} + +// Finalize indicates an expected call of Finalize +func (mr *MockBaseResultsMockRecorder) Finalize() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Finalize", reflect.TypeOf((*MockBaseResults)(nil).Finalize)) +} + +// MockBaseResultsBuilder is a mock of BaseResultsBuilder interface +type MockBaseResultsBuilder struct { + ctrl *gomock.Controller + recorder *MockBaseResultsBuilderMockRecorder +} + +// MockBaseResultsBuilderMockRecorder is the mock recorder for MockBaseResultsBuilder +type MockBaseResultsBuilderMockRecorder struct { + mock *MockBaseResultsBuilder +} + +// NewMockBaseResultsBuilder creates a new mock instance +func NewMockBaseResultsBuilder(ctrl *gomock.Controller) *MockBaseResultsBuilder { + mock := &MockBaseResultsBuilder{ctrl: ctrl} + mock.recorder = &MockBaseResultsBuilderMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockBaseResultsBuilder) EXPECT() *MockBaseResultsBuilderMockRecorder { + return m.recorder +} + // EnforceLimits mocks base method -func (m *MockBaseResults) EnforceLimits() bool { +func (m *MockBaseResultsBuilder) EnforceLimits() bool { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "EnforceLimits") ret0, _ := ret[0].(bool) @@ -123,13 +204,13 @@ func (m *MockBaseResults) EnforceLimits() bool { } // EnforceLimits indicates an expected call of EnforceLimits -func (mr *MockBaseResultsMockRecorder) EnforceLimits() *gomock.Call { +func (mr *MockBaseResultsBuilderMockRecorder) EnforceLimits() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockBaseResults)(nil).EnforceLimits)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockBaseResultsBuilder)(nil).EnforceLimits)) } // AddDocuments mocks base method -func (m *MockBaseResults) AddDocuments(batch []doc.Document) (int, int, error) { +func (m *MockBaseResultsBuilder) AddDocuments(batch []doc.Document) (int, int, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "AddDocuments", batch) ret0, _ := ret[0].(int) @@ -139,21 +220,9 @@ func (m *MockBaseResults) AddDocuments(batch []doc.Document) (int, int, error) { } // AddDocuments indicates an expected call of AddDocuments -func (mr *MockBaseResultsMockRecorder) AddDocuments(batch interface{}) *gomock.Call { +func (mr 
*MockBaseResultsBuilderMockRecorder) AddDocuments(batch interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", reflect.TypeOf((*MockBaseResults)(nil).AddDocuments), batch) -} - -// Finalize mocks base method -func (m *MockBaseResults) Finalize() { - m.ctrl.T.Helper() - m.ctrl.Call(m, "Finalize") -} - -// Finalize indicates an expected call of Finalize -func (mr *MockBaseResultsMockRecorder) Finalize() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Finalize", reflect.TypeOf((*MockBaseResults)(nil).Finalize)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", reflect.TypeOf((*MockBaseResultsBuilder)(nil).AddDocuments), batch) } // MockQueryResults is a mock of QueryResults interface @@ -179,6 +248,36 @@ func (m *MockQueryResults) EXPECT() *MockQueryResultsMockRecorder { return m.recorder } +// EnforceLimits mocks base method +func (m *MockQueryResults) EnforceLimits() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EnforceLimits") + ret0, _ := ret[0].(bool) + return ret0 +} + +// EnforceLimits indicates an expected call of EnforceLimits +func (mr *MockQueryResultsMockRecorder) EnforceLimits() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockQueryResults)(nil).EnforceLimits)) +} + +// AddDocuments mocks base method +func (m *MockQueryResults) AddDocuments(batch []doc.Document) (int, int, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddDocuments", batch) + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(int) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// AddDocuments indicates an expected call of AddDocuments +func (mr *MockQueryResultsMockRecorder) AddDocuments(batch interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", reflect.TypeOf((*MockQueryResults)(nil).AddDocuments), batch) +} + // Namespace mocks base method func (m *MockQueryResults) Namespace() ident.ID { m.ctrl.T.Helper() @@ -221,34 +320,19 @@ func (mr *MockQueryResultsMockRecorder) TotalDocsCount() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TotalDocsCount", reflect.TypeOf((*MockQueryResults)(nil).TotalDocsCount)) } -// EnforceLimits mocks base method -func (m *MockQueryResults) EnforceLimits() bool { +// NonConcurrentBuilder mocks base method +func (m *MockQueryResults) NonConcurrentBuilder() (BaseResultsBuilder, bool) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "EnforceLimits") - ret0, _ := ret[0].(bool) - return ret0 -} - -// EnforceLimits indicates an expected call of EnforceLimits -func (mr *MockQueryResultsMockRecorder) EnforceLimits() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockQueryResults)(nil).EnforceLimits)) -} - -// AddDocuments mocks base method -func (m *MockQueryResults) AddDocuments(batch []doc.Document) (int, int, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "AddDocuments", batch) - ret0, _ := ret[0].(int) - ret1, _ := ret[1].(int) - ret2, _ := ret[2].(error) - return ret0, ret1, ret2 + ret := m.ctrl.Call(m, "NonConcurrentBuilder") + ret0, _ := ret[0].(BaseResultsBuilder) + ret1, _ := ret[1].(bool) + return ret0, ret1 } -// AddDocuments indicates an expected call of AddDocuments -func (mr *MockQueryResultsMockRecorder) AddDocuments(batch interface{}) *gomock.Call { +// 
NonConcurrentBuilder indicates an expected call of NonConcurrentBuilder +func (mr *MockQueryResultsMockRecorder) NonConcurrentBuilder() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", reflect.TypeOf((*MockQueryResults)(nil).AddDocuments), batch) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NonConcurrentBuilder", reflect.TypeOf((*MockQueryResults)(nil).NonConcurrentBuilder)) } // Finalize mocks base method @@ -373,6 +457,36 @@ func (m *MockAggregateResults) EXPECT() *MockAggregateResultsMockRecorder { return m.recorder } +// EnforceLimits mocks base method +func (m *MockAggregateResults) EnforceLimits() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EnforceLimits") + ret0, _ := ret[0].(bool) + return ret0 +} + +// EnforceLimits indicates an expected call of EnforceLimits +func (mr *MockAggregateResultsMockRecorder) EnforceLimits() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockAggregateResults)(nil).EnforceLimits)) +} + +// AddDocuments mocks base method +func (m *MockAggregateResults) AddDocuments(batch []doc.Document) (int, int, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddDocuments", batch) + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(int) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// AddDocuments indicates an expected call of AddDocuments +func (mr *MockAggregateResultsMockRecorder) AddDocuments(batch interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", reflect.TypeOf((*MockAggregateResults)(nil).AddDocuments), batch) +} + // Namespace mocks base method func (m *MockAggregateResults) Namespace() ident.ID { m.ctrl.T.Helper() @@ -415,34 +529,19 @@ func (mr *MockAggregateResultsMockRecorder) TotalDocsCount() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TotalDocsCount", reflect.TypeOf((*MockAggregateResults)(nil).TotalDocsCount)) } -// EnforceLimits mocks base method -func (m *MockAggregateResults) EnforceLimits() bool { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "EnforceLimits") - ret0, _ := ret[0].(bool) - return ret0 -} - -// EnforceLimits indicates an expected call of EnforceLimits -func (mr *MockAggregateResultsMockRecorder) EnforceLimits() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnforceLimits", reflect.TypeOf((*MockAggregateResults)(nil).EnforceLimits)) -} - -// AddDocuments mocks base method -func (m *MockAggregateResults) AddDocuments(batch []doc.Document) (int, int, error) { +// NonConcurrentBuilder mocks base method +func (m *MockAggregateResults) NonConcurrentBuilder() (BaseResultsBuilder, bool) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "AddDocuments", batch) - ret0, _ := ret[0].(int) - ret1, _ := ret[1].(int) - ret2, _ := ret[2].(error) - return ret0, ret1, ret2 + ret := m.ctrl.Call(m, "NonConcurrentBuilder") + ret0, _ := ret[0].(BaseResultsBuilder) + ret1, _ := ret[1].(bool) + return ret0, ret1 } -// AddDocuments indicates an expected call of AddDocuments -func (mr *MockAggregateResultsMockRecorder) AddDocuments(batch interface{}) *gomock.Call { +// NonConcurrentBuilder indicates an expected call of NonConcurrentBuilder +func (mr *MockAggregateResultsMockRecorder) NonConcurrentBuilder() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddDocuments", 
reflect.TypeOf((*MockAggregateResults)(nil).AddDocuments), batch) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NonConcurrentBuilder", reflect.TypeOf((*MockAggregateResults)(nil).NonConcurrentBuilder)) } // Finalize mocks base method @@ -707,6 +806,61 @@ func (mr *MockOnIndexSeriesMockRecorder) NeedsIndexUpdate(indexBlockStartForWrit return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NeedsIndexUpdate", reflect.TypeOf((*MockOnIndexSeries)(nil).NeedsIndexUpdate), indexBlockStartForWrite) } +// IfAlreadyIndexedMarkIndexSuccessAndFinalize mocks base method +func (m *MockOnIndexSeries) IfAlreadyIndexedMarkIndexSuccessAndFinalize(blockStart time0.UnixNano) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IfAlreadyIndexedMarkIndexSuccessAndFinalize", blockStart) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IfAlreadyIndexedMarkIndexSuccessAndFinalize indicates an expected call of IfAlreadyIndexedMarkIndexSuccessAndFinalize +func (mr *MockOnIndexSeriesMockRecorder) IfAlreadyIndexedMarkIndexSuccessAndFinalize(blockStart interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IfAlreadyIndexedMarkIndexSuccessAndFinalize", reflect.TypeOf((*MockOnIndexSeries)(nil).IfAlreadyIndexedMarkIndexSuccessAndFinalize), blockStart) +} + +// RemoveIndexedForBlockStarts mocks base method +func (m *MockOnIndexSeries) RemoveIndexedForBlockStarts(blockStarts map[time0.UnixNano]struct{}) RemoveIndexedForBlockStartsResult { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemoveIndexedForBlockStarts", blockStarts) + ret0, _ := ret[0].(RemoveIndexedForBlockStartsResult) + return ret0 +} + +// RemoveIndexedForBlockStarts indicates an expected call of RemoveIndexedForBlockStarts +func (mr *MockOnIndexSeriesMockRecorder) RemoveIndexedForBlockStarts(blockStarts interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveIndexedForBlockStarts", reflect.TypeOf((*MockOnIndexSeries)(nil).RemoveIndexedForBlockStarts), blockStarts) +} + +// RelookupAndIncrementReaderWriterCount mocks base method +func (m *MockOnIndexSeries) RelookupAndIncrementReaderWriterCount() (OnIndexSeries, bool) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RelookupAndIncrementReaderWriterCount") + ret0, _ := ret[0].(OnIndexSeries) + ret1, _ := ret[1].(bool) + return ret0, ret1 +} + +// RelookupAndIncrementReaderWriterCount indicates an expected call of RelookupAndIncrementReaderWriterCount +func (mr *MockOnIndexSeriesMockRecorder) RelookupAndIncrementReaderWriterCount() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RelookupAndIncrementReaderWriterCount", reflect.TypeOf((*MockOnIndexSeries)(nil).RelookupAndIncrementReaderWriterCount)) +} + +// DecrementReaderWriterCount mocks base method +func (m *MockOnIndexSeries) DecrementReaderWriterCount() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "DecrementReaderWriterCount") +} + +// DecrementReaderWriterCount indicates an expected call of DecrementReaderWriterCount +func (mr *MockOnIndexSeriesMockRecorder) DecrementReaderWriterCount() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DecrementReaderWriterCount", reflect.TypeOf((*MockOnIndexSeries)(nil).DecrementReaderWriterCount)) +} + // MockBlock is a mock of Block interface type MockBlock struct { ctrl *gomock.Controller @@ -730,6 +884,20 @@ func (m *MockBlock) EXPECT() *MockBlockMockRecorder { return m.recorder } +// 
InMemoryBlockNotifySealedBlocks mocks base method +func (m *MockBlock) InMemoryBlockNotifySealedBlocks(sealed []time0.UnixNano) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "InMemoryBlockNotifySealedBlocks", sealed) + ret0, _ := ret[0].(error) + return ret0 +} + +// InMemoryBlockNotifySealedBlocks indicates an expected call of InMemoryBlockNotifySealedBlocks +func (mr *MockBlockMockRecorder) InMemoryBlockNotifySealedBlocks(sealed interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InMemoryBlockNotifySealedBlocks", reflect.TypeOf((*MockBlock)(nil).InMemoryBlockNotifySealedBlocks), sealed) +} + // StartTime mocks base method func (m *MockBlock) StartTime() time.Time { m.ctrl.T.Helper() @@ -846,6 +1014,20 @@ func (mr *MockBlockMockRecorder) Stats(reporter interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stats", reflect.TypeOf((*MockBlock)(nil).Stats), reporter) } +// IsOpen mocks base method +func (m *MockBlock) IsOpen() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsOpen") + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsOpen indicates an expected call of IsOpen +func (mr *MockBlockMockRecorder) IsOpen() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsOpen", reflect.TypeOf((*MockBlock)(nil).IsOpen)) +} + // Seal mocks base method func (m *MockBlock) Seal() error { m.ctrl.T.Helper() @@ -931,9 +1113,11 @@ func (mr *MockBlockMockRecorder) EvictColdMutableSegments() *gomock.Call { } // RotateColdMutableSegments mocks base method -func (m *MockBlock) RotateColdMutableSegments() { +func (m *MockBlock) RotateColdMutableSegments() error { m.ctrl.T.Helper() - m.ctrl.Call(m, "RotateColdMutableSegments") + ret := m.ctrl.Call(m, "RotateColdMutableSegments") + ret0, _ := ret[0].(error) + return ret0 } // RotateColdMutableSegments indicates an expected call of RotateColdMutableSegments @@ -1625,6 +1809,34 @@ func (mr *MockOptionsMockRecorder) PostingsListCache() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PostingsListCache", reflect.TypeOf((*MockOptions)(nil).PostingsListCache)) } +// SetSearchPostingsListCache mocks base method +func (m *MockOptions) SetSearchPostingsListCache(value *PostingsListCache) Options { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetSearchPostingsListCache", value) + ret0, _ := ret[0].(Options) + return ret0 +} + +// SetSearchPostingsListCache indicates an expected call of SetSearchPostingsListCache +func (mr *MockOptionsMockRecorder) SetSearchPostingsListCache(value interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetSearchPostingsListCache", reflect.TypeOf((*MockOptions)(nil).SetSearchPostingsListCache), value) +} + +// SearchPostingsListCache mocks base method +func (m *MockOptions) SearchPostingsListCache() *PostingsListCache { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SearchPostingsListCache") + ret0, _ := ret[0].(*PostingsListCache) + return ret0 +} + +// SearchPostingsListCache indicates an expected call of SearchPostingsListCache +func (mr *MockOptionsMockRecorder) SearchPostingsListCache() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SearchPostingsListCache", reflect.TypeOf((*MockOptions)(nil).SearchPostingsListCache)) +} + // SetReadThroughSegmentOptions mocks base method func (m *MockOptions) SetReadThroughSegmentOptions(value ReadThroughSegmentOptions) Options { 
m.ctrl.T.Helper() @@ -1764,3 +1976,59 @@ func (mr *MockOptionsMockRecorder) QueryLimits() *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "QueryLimits", reflect.TypeOf((*MockOptions)(nil).QueryLimits)) } + +// SetQueryBlockWorkerPool mocks base method +func (m *MockOptions) SetQueryBlockWorkerPool(value sync.WorkerPool) Options { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetQueryBlockWorkerPool", value) + ret0, _ := ret[0].(Options) + return ret0 +} + +// SetQueryBlockWorkerPool indicates an expected call of SetQueryBlockWorkerPool +func (mr *MockOptionsMockRecorder) SetQueryBlockWorkerPool(value interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetQueryBlockWorkerPool", reflect.TypeOf((*MockOptions)(nil).SetQueryBlockWorkerPool), value) +} + +// QueryBlockWorkerPool mocks base method +func (m *MockOptions) QueryBlockWorkerPool() sync.WorkerPool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "QueryBlockWorkerPool") + ret0, _ := ret[0].(sync.WorkerPool) + return ret0 +} + +// QueryBlockWorkerPool indicates an expected call of QueryBlockWorkerPool +func (mr *MockOptionsMockRecorder) QueryBlockWorkerPool() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "QueryBlockWorkerPool", reflect.TypeOf((*MockOptions)(nil).QueryBlockWorkerPool)) +} + +// SetQueryBlockSegmentWorkerPool mocks base method +func (m *MockOptions) SetQueryBlockSegmentWorkerPool(value sync.WorkerPool) Options { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetQueryBlockSegmentWorkerPool", value) + ret0, _ := ret[0].(Options) + return ret0 +} + +// SetQueryBlockSegmentWorkerPool indicates an expected call of SetQueryBlockSegmentWorkerPool +func (mr *MockOptionsMockRecorder) SetQueryBlockSegmentWorkerPool(value interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetQueryBlockSegmentWorkerPool", reflect.TypeOf((*MockOptions)(nil).SetQueryBlockSegmentWorkerPool), value) +} + +// QueryBlockSegmentWorkerPool mocks base method +func (m *MockOptions) QueryBlockSegmentWorkerPool() sync.WorkerPool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "QueryBlockSegmentWorkerPool") + ret0, _ := ret[0].(sync.WorkerPool) + return ret0 +} + +// QueryBlockSegmentWorkerPool indicates an expected call of QueryBlockSegmentWorkerPool +func (mr *MockOptionsMockRecorder) QueryBlockSegmentWorkerPool() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "QueryBlockSegmentWorkerPool", reflect.TypeOf((*MockOptions)(nil).QueryBlockSegmentWorkerPool)) +} diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index b32a7d5c0c..45abb7cba9 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -97,6 +97,7 @@ type mutableSegments struct { sealedBlockStarts map[xtime.UnixNano]struct{} backgroundCompactGCPending bool + backgroundCompactDisable bool metrics mutableSegmentsMetrics logger *zap.Logger @@ -141,10 +142,10 @@ type mutableSegmentsMetrics struct { activeBlockGarbageCollectCachedSearchesMatched tally.Histogram activeBlockGarbageCollectReconstructCachedSearchEvalSkip tally.Counter activeBlockGarbageCollectReconstructCachedSearchEvalAttempt tally.Counter - activeBlockGarbageCollectReconstructCachedSearchSuccess tally.Counter - activeBlockGarbageCollectReconstructCachedSearchError tally.Counter 
activeBlockGarbageCollectReconstructCachedSearchCacheHit tally.Counter activeBlockGarbageCollectReconstructCachedSearchCacheMiss tally.Counter + activeBlockGarbageCollectReconstructCachedSearchExecSuccess tally.Counter + activeBlockGarbageCollectReconstructCachedSearchExecError tally.Counter } func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { @@ -175,22 +176,22 @@ func newMutableSegmentsMetrics(s tally.Scope) mutableSegmentsMetrics { append(tally.ValueBuckets{0, 1}, tally.MustMakeExponentialValueBuckets(2, 2, 12)...)), activeBlockGarbageCollectReconstructCachedSearchEvalSkip: backgroundScope.Tagged(map[string]string{ "eval_type": "skip", - }).Counter("gc-reconstruct-cached-search"), + }).Counter("gc-reconstruct-cached-search-eval"), activeBlockGarbageCollectReconstructCachedSearchEvalAttempt: backgroundScope.Tagged(map[string]string{ "eval_type": "attempt", - }).Counter("gc-reconstruct-cached-search"), - activeBlockGarbageCollectReconstructCachedSearchSuccess: backgroundScope.Tagged(map[string]string{ - "result_type": "success", - }).Counter("gc-reconstruct-cached-search"), - activeBlockGarbageCollectReconstructCachedSearchError: backgroundScope.Tagged(map[string]string{ - "result_type": "error", - }).Counter("gc-reconstruct-cached-search"), + }).Counter("gc-reconstruct-cached-search-eval"), activeBlockGarbageCollectReconstructCachedSearchCacheHit: backgroundScope.Tagged(map[string]string{ "result_type": "cache_hit", }).Counter("gc-reconstruct-cached-search-cache-result"), activeBlockGarbageCollectReconstructCachedSearchCacheMiss: backgroundScope.Tagged(map[string]string{ "result_type": "cache_miss", }).Counter("gc-reconstruct-cached-search-cache-result"), + activeBlockGarbageCollectReconstructCachedSearchExecSuccess: backgroundScope.Tagged(map[string]string{ + "result_type": "success", + }).Counter("gc-reconstruct-cached-search-exec-result"), + activeBlockGarbageCollectReconstructCachedSearchExecError: backgroundScope.Tagged(map[string]string{ + "result_type": "error", + }).Counter("gc-reconstruct-cached-search-exec-result"), } } @@ -470,10 +471,17 @@ func (m *mutableSegments) Close() { } func (m *mutableSegments) maybeBackgroundCompactWithLock() { + if m.backgroundCompactDisable { + return + } if m.compact.compactingBackgroundStandard { return } + m.backgroundCompactWithLock() +} + +func (m *mutableSegments) backgroundCompactWithLock() { // Create a logical plan. segs := make([]compaction.Segment, 0, len(m.backgroundSegments)) for _, seg := range m.backgroundSegments { @@ -951,10 +959,10 @@ func (m *mutableSegments) populateCachedSearches( return func() error { e := fn() if e != nil { - m.metrics.activeBlockGarbageCollectReconstructCachedSearchError.Inc(1) + m.metrics.activeBlockGarbageCollectReconstructCachedSearchExecError.Inc(1) return e } - m.metrics.activeBlockGarbageCollectReconstructCachedSearchSuccess.Inc(1) + m.metrics.activeBlockGarbageCollectReconstructCachedSearchExecSuccess.Inc(1) return nil } } diff --git a/src/dbnode/storage/index/mutable_segments_test.go b/src/dbnode/storage/index/mutable_segments_test.go new file mode 100644 index 0000000000..6505087e05 --- /dev/null +++ b/src/dbnode/storage/index/mutable_segments_test.go @@ -0,0 +1,75 @@ +// Copyright (c) 2021 Uber Technologies, Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package index + +import ( + "testing" + "time" + + "github.com/m3db/m3/src/dbnode/namespace" + xsync "github.com/m3db/m3/src/x/sync" + xtest "github.com/m3db/m3/src/x/test" + + "github.com/stretchr/testify/require" +) + +func newTestMutableSegments( + t *testing.T, + md namespace.Metadata, + blockStart time.Time, +) *mutableSegments { + cachedSearchesWorkers := xsync.NewWorkerPool(2) + cachedSearchesWorkers.Init() + + segs, err := newMutableSegments(md, blockStart, testOpts, BlockOptions{}, + cachedSearchesWorkers, namespace.NewRuntimeOptionsManager("foo"), + testOpts.InstrumentOptions()) + require.NoError(t, err) + + return segs +} + +func TestMutableSegmentsBackgroundCompact(t *testing.T) { + ctrl := xtest.NewController(t) + defer ctrl.Finish() + + blockSize := time.Hour + testMD := newTestNSMetadata(t) + blockStart := time.Now().Truncate(blockSize) + + nowNotBlockStartAligned := blockStart.Add(time.Minute) + + segs := newTestMutableSegments(t, testMD, blockStart) + segs.backgroundCompactDisable = true // Disable to explicitly test. + + batch := NewWriteBatch(WriteBatchOptions{ + IndexBlockSize: blockSize, + }) + + for i := 0; i < 32; i++ { + batch.Append(WriteBatchEntry{ + Timestamp: nowNotBlockStartAligned, + }, testDocN(i)) + } + + _, err := segs.WriteBatch(batch) + require.NoError(t, err) +} diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index 6d032773ec..b90de34146 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -21,9 +21,7 @@ package index import ( - "bytes" "errors" - "sync" "time" "github.com/m3db/m3/src/m3ninx/generated/proto/querypb" @@ -33,7 +31,6 @@ import ( "github.com/m3db/m3/src/x/instrument" "github.com/cespare/xxhash/v2" - "github.com/dgraph-io/ristretto" "github.com/pborman/uuid" "github.com/uber-go/tally" "go.uber.org/zap" @@ -63,7 +60,6 @@ const ( reportLoopInterval = 10 * time.Second emptyPattern = "" - lruTTL = 15 * time.Minute ) // PostingsListCacheOptions is the options struct for the query cache. @@ -85,47 +81,15 @@ func (o PostingsListCacheOptions) Validate() error { // PostingsListCache implements an LRU for caching queries and their results. 
type PostingsListCache struct { - lru *ristretto.Cache + lru *postingsListLRU size int opts PostingsListCacheOptions metrics *postingsListCacheMetrics - registry postingsListCacheRegistry - logger *zap.Logger } -type postingsListCacheRegistry struct { - sync.RWMutex - eventCh chan postingsListEvent - active map[uuid.Array]map[registryKey]registryValue -} - -type registryKey struct { - field string - pattern string - patternType PatternType - searchQueryKey string -} - -type registryValue struct { - searchQuery *querypb.Query - postings postings.List -} - -type postingsListEventType int - -const ( - addEventType postingsListEventType = iota - removeEventType -) - -type postingsListEvent struct { - eventType postingsListEventType - cachedPostings *cachedPostings -} - // NewPostingsListCache creates a new query cache. func NewPostingsListCache( size int, @@ -136,45 +100,23 @@ func NewPostingsListCache( return nil, nil, err } + lru, err := newPostingsListLRU(size) + if err != nil { + return nil, nil, err + } + plc := &PostingsListCache{ - size: size, - opts: opts, - registry: postingsListCacheRegistry{ - eventCh: make(chan postingsListEvent, 4096), - active: make(map[uuid.Array]map[registryKey]registryValue), - }, + lru: lru, + size: size, + opts: opts, metrics: newPostingsListCacheMetrics(opts.InstrumentOptions.MetricsScope()), logger: opts.InstrumentOptions.Logger(), } - plc.lru, err = ristretto.NewCache(&ristretto.Config{ - NumCounters: int64(10 * size), // number of keys to track frequency of. - MaxCost: int64(size), // maximum cost of cache. - BufferItems: 64, // number of keys per Get buffer. - KeyToHash: func(k interface{}) (uint64, uint64) { - return k.(uint64), 0 - }, - OnEvict: plc.onEvict, - }) - if err != nil { - return nil, nil, err - } closer := plc.startLoop() return plc, closer, nil } -func (q *PostingsListCache) onEvict(key, conflict uint64, value interface{}, cost int64) { - v, ok := value.(*cachedPostings) - if !ok { - return - } - - q.registry.eventCh <- postingsListEvent{ - eventType: removeEventType, - cachedPostings: v, - } -} - // GetRegexp returns the cached results for the provided regexp query, if any. func (q *PostingsListCache) GetRegexp( segmentUUID uuid.UUID, @@ -215,23 +157,13 @@ func (q *PostingsListCache) get( pattern string, patternType PatternType, ) (postings.List, bool) { - var pl *cachedPostings - entry, ok := q.lru.Get(keyHash(segmentUUID, field, pattern, patternType)) - if ok { - pl = entry.(*cachedPostings) - ok = bytes.Equal(segmentUUID, pl.segmentUUID) && - field == pl.field && - pattern == pl.pattern && - patternType == pl.patternType - } - + entry, ok := q.lru.Get(segmentUUID, field, pattern, patternType) q.emitCacheGetMetrics(patternType, ok) - if !ok { return nil, false } - return pl.postings, ok + return entry.postings, ok } type cachedPostings struct { @@ -344,7 +276,6 @@ func (q *PostingsListCache) put( searchQueryProto = searchQuery.ToProto() } - key := keyHash(segmentUUID, field, pattern, patternType) value := &cachedPostings{ segmentUUID: segmentUUID, field: field, @@ -353,12 +284,15 @@ func (q *PostingsListCache) put( searchQuery: searchQueryProto, postings: pl, } - q.lru.SetWithTTL(key, value, 1, lruTTL) + q.lru.Add(segmentUUID, field, pattern, patternType, value) + q.emitCachePutMetrics(patternType) - q.registry.eventCh <- postingsListEvent{ - eventType: addEventType, - cachedPostings: value, - } +} + +// PurgeSegment removes all postings lists associated with the specified +// segment from the cache. 
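+//
+// An illustrative usage sketch (the seg variable and its methods are assumed
+// here for illustration only): callers purge before closing so that no cached
+// postings list can outlive the mmap'd bytes it points into:
+//
+//	cache.PurgeSegment(seg.UUID()) // drop every cached entry for this segment
+//	seg.Close()                    // now safe to unmap the segment's bytes
+//
+// ReadThroughSegment.Close performs exactly this purge against both the
+// segment and search postings list caches (see read_through_segment.go below).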
+func (q *PostingsListCache) PurgeSegment(segmentUUID uuid.UUID) { + q.lru.PurgeSegment(segmentUUID) } // startLoop starts a background process that will call Report() @@ -380,33 +314,6 @@ func (q *PostingsListCache) startLoop() Closer { } }() - go func() { - for { - // Process first without lock (just wait blindly). - var ev postingsListEvent - select { - case <-doneCh: - return - case ev = <-q.registry.eventCh: - } - - // Now acquire lock and process as many as can while batched. - q.registry.Lock() - // Process first. - q.processEventWithLock(ev) - // Process as many while holding lock until no more to read. - for more := true; more; { - select { - case ev = <-q.registry.eventCh: - q.processEventWithLock(ev) - default: - more = false - } - } - q.registry.Unlock() - } - }() - return func() { close(doneCh) } } @@ -438,10 +345,10 @@ func (q *PostingsListCache) CachedPatterns( ) CachedPatternsResult { var result CachedPatternsResult - q.registry.RLock() - defer q.registry.RUnlock() + q.lru.RLock() + defer q.lru.RUnlock() - segmentPostings, ok := q.registry.active[uuid.Array()] + segmentPostings, ok := q.lru.items[uuid.Array()] if !ok { return result } @@ -457,9 +364,9 @@ func (q *PostingsListCache) CachedPatterns( Field: key.field, Pattern: key.pattern, PatternType: key.patternType, - SearchQueryKey: key.searchQueryKey, - SearchQuery: value.searchQuery, - Postings: value.postings, + SearchQueryKey: value.Value.(*entry).cachedPostings.searchQueryKey, + SearchQuery: value.Value.(*entry).cachedPostings.searchQuery, + Postings: value.Value.(*entry).cachedPostings.postings, }) result.MatchedPatterns++ } @@ -467,37 +374,6 @@ func (q *PostingsListCache) CachedPatterns( return result } -func (q *PostingsListCache) processEventWithLock(ev postingsListEvent) { - uuid := ev.cachedPostings.segmentUUID.Array() - key := registryKey{ - field: ev.cachedPostings.field, - pattern: ev.cachedPostings.pattern, - patternType: ev.cachedPostings.patternType, - searchQueryKey: ev.cachedPostings.searchQueryKey, - } - value := registryValue{ - searchQuery: ev.cachedPostings.searchQuery, - postings: ev.cachedPostings.postings, - } - segmentPostings, ok := q.registry.active[uuid] - if !ok { - segmentPostings = make(map[registryKey]registryValue) - q.registry.active[uuid] = segmentPostings - } - - switch ev.eventType { - case removeEventType: - delete(segmentPostings, key) - if len(segmentPostings) == 0 { - delete(q.registry.active, uuid) - } - case addEventType: - segmentPostings[key] = value - } - - q.emitRegistryMetrics(ev.cachedPostings.patternType, ev.eventType) -} - // Report will emit metrics about the status of the cache. 
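// It runs on the report-loop ticker that startLoop sets up above.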
func (q *PostingsListCache) Report() {
 	q.metrics.capacity.Update(float64(q.size))
@@ -539,31 +415,6 @@ func (q *PostingsListCache) emitCachePutMetrics(patternType PatternType) {
 	}
 }
 
-func (q *PostingsListCache) emitRegistryMetrics(
-	patternType PatternType,
-	eventType postingsListEventType,
-) {
-	var method *postingsListCacheMethodMetrics
-	switch patternType {
-	case PatternTypeRegexp:
-		method = q.metrics.regexp
-	case PatternTypeTerm:
-		method = q.metrics.term
-	case PatternTypeField:
-		method = q.metrics.field
-	case PatternTypeSearch:
-		method = q.metrics.search
-	default:
-		method = q.metrics.unknown // should never happen
-	}
-	switch eventType {
-	case removeEventType:
-		method.registryRemoves.Inc(1)
-	case addEventType:
-		method.registryAdds.Inc(1)
-	}
-}
-
 type postingsListCacheMetrics struct {
 	regexp *postingsListCacheMethodMetrics
 	term   *postingsListCacheMethodMetrics
@@ -611,19 +462,15 @@ func newPostingsListCacheMetrics(scope tally.Scope) *postingsListCacheMetrics {
 }
 
 type postingsListCacheMethodMetrics struct {
-	hits            tally.Counter
-	misses          tally.Counter
-	puts            tally.Counter
-	registryAdds    tally.Counter
-	registryRemoves tally.Counter
+	hits   tally.Counter
+	misses tally.Counter
+	puts   tally.Counter
 }
 
 func newPostingsListCacheMethodMetrics(scope tally.Scope) *postingsListCacheMethodMetrics {
 	return &postingsListCacheMethodMetrics{
-		hits:            scope.Counter("hits"),
-		misses:          scope.Counter("misses"),
-		puts:            scope.Counter("puts"),
-		registryAdds:    scope.Counter("registry_adds"),
-		registryRemoves: scope.Counter("registry_removes"),
+		hits:   scope.Counter("hits"),
+		misses: scope.Counter("misses"),
+		puts:   scope.Counter("puts"),
 	}
 }
diff --git a/src/dbnode/storage/index/postings_list_cache_lru.go b/src/dbnode/storage/index/postings_list_cache_lru.go
new file mode 100644
index 0000000000..faa6e3c9bf
--- /dev/null
+++ b/src/dbnode/storage/index/postings_list_cache_lru.go
@@ -0,0 +1,235 @@
+// Copyright (c) 2019 Uber Technologies, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package index
+
+import (
+	"container/list"
+	"errors"
+	"sync"
+
+	"github.com/pborman/uuid"
+)
+
+// postingsListLRU implements a thread-safe, fixed-size LRU cache of postings lists
+// that were resolved by running a given query against a particular segment for a given
+// field and pattern type (term vs regexp). Normally a key in the LRU would look like:
+//
+// type key struct {
+//    segmentUUID uuid.UUID
+//    field       string
+//    pattern     string
+//    patternType PatternType
+// }
+//
+// However, some of the postings lists that we will store in the LRU have a fixed lifecycle
+// because they reference mmap'd byte slices which will eventually be unmap'd. To prevent
+// these postings lists that point to unmap'd regions from remaining in the LRU, we want to
+// support the ability to efficiently purge the LRU of any postings lists that belong to a
+// given segment. This isn't technically required for correctness as once a segment has been
+// closed, its old postings lists in the LRU will never be accessed again (since they are only
+// addressable by that segment's UUID), but we purge them from the LRU before closing the segment
+// anyway as an additional safety precaution.
+//
+// Instead of adding additional tracking on top of an existing generic LRU, we've created a
+// specialized LRU that, instead of having a single top-level map pointing into the linked-list,
+// has a two-level map where the top-level map is keyed by segment UUID and the second-level map
+// is keyed by the field/pattern/patternType.
+//
+// As a result, when a segment is ready to be closed, it can call into the cache with its
+// UUID and we can efficiently remove all the entries corresponding to that segment from the
+// LRU. The specialization has the additional nice property that we don't need to allocate every time
+// we add an item to the LRU due to the interface{} conversion.
+type postingsListLRU struct {
+	sync.RWMutex
+	size      int
+	evictList *list.List
+	items     map[uuid.Array]map[key]*list.Element
+}
+
+// entry is used to hold a value in the evictList.
+type entry struct {
+	uuid           uuid.UUID
+	key            key
+	cachedPostings *cachedPostings
+}
+
+type key struct {
+	field       string
+	pattern     string
+	patternType PatternType
+}
+
+// newPostingsListLRU constructs an LRU of the given size.
+func newPostingsListLRU(size int) (*postingsListLRU, error) {
+	if size <= 0 {
+		return nil, errors.New("Must provide a positive size")
+	}
+
+	return &postingsListLRU{
+		size:      size,
+		evictList: list.New(),
+		items:     make(map[uuid.Array]map[key]*list.Element),
+	}, nil
+}
+
+// Add adds a value to the cache. Returns true if an eviction occurred.
+func (c *postingsListLRU) Add(
+	segmentUUID uuid.UUID,
+	field string,
+	pattern string,
+	patternType PatternType,
+	cachedPostings *cachedPostings,
+) (evicted bool) {
+	c.Lock()
+	defer c.Unlock()
+
+	newKey := newKey(field, pattern, patternType)
+	// Check for existing item.
+	uuidArray := segmentUUID.Array()
+	if uuidEntries, ok := c.items[uuidArray]; ok {
+		if ent, ok := uuidEntries[newKey]; ok {
+			// If it already exists, just move it to the front. This avoids storing
+			// the same item in the LRU twice, which is important because the maps
+			// can only point to one entry at a time and we use them for purges. Also,
+			// it saves space by avoiding storing duplicate values.
+			c.evictList.MoveToFront(ent)
+			ent.Value.(*entry).cachedPostings = cachedPostings
+			return false
+		}
+	}
+
+	// Add new item.
+	var (
+		ent = &entry{
+			uuid:           segmentUUID,
+			key:            newKey,
+			cachedPostings: cachedPostings,
+		}
+		entry = c.evictList.PushFront(ent)
+	)
+	if queries, ok := c.items[uuidArray]; ok {
+		queries[newKey] = entry
+	} else {
+		c.items[uuidArray] = map[key]*list.Element{
+			newKey: entry,
+		}
+	}
+
+	// Check whether the insert pushed us over the configured size.
+	evict := c.evictList.Len() > c.size
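+	// Each Add inserts at most one new element, so evicting the single
+	// oldest entry is enough to restore the size invariant.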
+ if evict { + c.removeOldest() + } + return evict +} + +// Get looks up a key's value from the cache. +func (c *postingsListLRU) Get( + segmentUUID uuid.UUID, + field string, + pattern string, + patternType PatternType, +) (*cachedPostings, bool) { + c.Lock() + defer c.Unlock() + + newKey := newKey(field, pattern, patternType) + uuidArray := segmentUUID.Array() + + uuidEntries, ok := c.items[uuidArray] + if !ok { + return nil, false + } + + ent, ok := uuidEntries[newKey] + if !ok { + return nil, false + } + + c.evictList.MoveToFront(ent) + return ent.Value.(*entry).cachedPostings, true +} + +// Remove removes the provided key from the cache, returning if the +// key was contained. +func (c *postingsListLRU) Remove( + segmentUUID uuid.UUID, + field string, + pattern string, + patternType PatternType, +) bool { + c.Lock() + defer c.Unlock() + + newKey := newKey(field, pattern, patternType) + uuidArray := segmentUUID.Array() + if uuidEntries, ok := c.items[uuidArray]; ok { + if ent, ok := uuidEntries[newKey]; ok { + c.removeElement(ent) + return true + } + } + + return false +} + +func (c *postingsListLRU) PurgeSegment(segmentUUID uuid.UUID) { + c.Lock() + defer c.Unlock() + + if uuidEntries, ok := c.items[segmentUUID.Array()]; ok { + for _, ent := range uuidEntries { + c.removeElement(ent) + } + } +} + +// Len returns the number of items in the cache. +func (c *postingsListLRU) Len() int { + c.RLock() + defer c.RUnlock() + return c.evictList.Len() +} + +// removeOldest removes the oldest item from the cache. +func (c *postingsListLRU) removeOldest() { + ent := c.evictList.Back() + if ent != nil { + c.removeElement(ent) + } +} + +// removeElement is used to remove a given list element from the cache +func (c *postingsListLRU) removeElement(e *list.Element) { + c.evictList.Remove(e) + entry := e.Value.(*entry) + + if patterns, ok := c.items[entry.uuid.Array()]; ok { + delete(patterns, entry.key) + if len(patterns) == 0 { + delete(c.items, entry.uuid.Array()) + } + } +} + +func newKey(field, pattern string, patternType PatternType) key { + return key{field: field, pattern: pattern, patternType: patternType} +} diff --git a/src/dbnode/storage/index/postings_list_cache_lru_test.go b/src/dbnode/storage/index/postings_list_cache_lru_test.go new file mode 100644 index 0000000000..2fb384e767 --- /dev/null +++ b/src/dbnode/storage/index/postings_list_cache_lru_test.go @@ -0,0 +1,32 @@ +// Copyright (c) 2019 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package index + +// Keys returns a slice of the keys in the cache, from oldest to newest. Used for +// testing only. +func (c *postingsListLRU) keys() []key { + keys := make([]key, 0, len(c.items)) + for ent := c.evictList.Back(); ent != nil; ent = ent.Prev() { + entry := ent.Value.(*entry) + keys = append(keys, entry.key) + } + return keys +} diff --git a/src/dbnode/storage/index/postings_list_cache_test.go b/src/dbnode/storage/index/postings_list_cache_test.go index e69fa29842..e4edd01a3f 100644 --- a/src/dbnode/storage/index/postings_list_cache_test.go +++ b/src/dbnode/storage/index/postings_list_cache_test.go @@ -71,7 +71,7 @@ func init() { testPlEntries = append(testPlEntries, testEntry{ segmentUUID: segmentUUID, - key: keyHash(segmentUUID, field, pattern, patternType), + key: newKey(field, pattern, patternType), postingsList: pl, }) } @@ -79,7 +79,7 @@ func init() { type testEntry struct { segmentUUID uuid.UUID - key uint64 + key key postingsList postings.List } @@ -273,70 +273,70 @@ func testConcurrency(t *testing.T, size int, purge bool, verify bool) { } } -func putEntry(t *testing.T, cache *PostingsListCache, entry testEntry) { +func putEntry(t *testing.T, cache *PostingsListCache, i int) { // Do each put twice to test the logic that avoids storing // multiple entries for the same value. switch testPlEntries[i].key.patternType { case PatternTypeRegexp: cache.PutRegexp( - entry.segmentUUID, - entry.key.field, - entry.key.pattern, - entry.postingsList, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].key.pattern, + testPlEntries[i].postingsList, ) cache.PutRegexp( - entry.segmentUUID, - entry.key.field, - entry.key.pattern, - entry.postingsList, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].key.pattern, + testPlEntries[i].postingsList, ) case PatternTypeTerm: cache.PutTerm( - entry.segmentUUID, - entry.key.field, - entry.key.pattern, - entry.postingsList, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].key.pattern, + testPlEntries[i].postingsList, ) cache.PutTerm( - entry.segmentUUID, - entry.key.field, - entry.key.pattern, - entry.postingsList, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].key.pattern, + testPlEntries[i].postingsList, ) case PatternTypeField: cache.PutField( - entry.segmentUUID, - entry.key.field, - entry.postingsList, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].postingsList, ) cache.PutField( - entry.segmentUUID, - entry.key.field, - entry.postingsList, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].postingsList, ) default: require.FailNow(t, "unknown pattern type", testPlEntries[i].key.patternType) } } -func getEntry(t *testing.T, cache *PostingsListCache, entry testEntry) (postings.List, bool) { +func getEntry(t *testing.T, cache *PostingsListCache, i int) (postings.List, bool) { switch testPlEntries[i].key.patternType { case PatternTypeRegexp: return cache.GetRegexp( - entry.segmentUUID, - entry.key.field, - entry.key.pattern, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].key.pattern, ) case PatternTypeTerm: return cache.GetTerm( - entry.segmentUUID, - 
entry.key.field, - entry.key.pattern, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, + testPlEntries[i].key.pattern, ) case PatternTypeField: return cache.GetField( - entry.segmentUUID, - entry.key.field, + testPlEntries[i].segmentUUID, + testPlEntries[i].key.field, ) default: require.FailNow(t, "unknown pattern type", testPlEntries[i].key.patternType) @@ -344,11 +344,9 @@ func getEntry(t *testing.T, cache *PostingsListCache, entry testEntry) (postings return nil, false } -func requireContains(t *testing.T, plCache *PostingsListCache, values []testEntry) { - // Wait for registry to catchup. - for _, value := range values { - _, ok := getEntry(t, plCache, value) - require.True(t, ok) +func requireExpectedOrder(t *testing.T, plCache *PostingsListCache, expectedOrder []testEntry) { + for i, key := range plCache.lru.keys() { + require.Equal(t, expectedOrder[i].key, key) } } diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go index 395617e97d..e8b81bbbd9 100644 --- a/src/dbnode/storage/index/read_through_segment.go +++ b/src/dbnode/storage/index/read_through_segment.go @@ -123,6 +123,19 @@ func (r *ReadThroughSegment) Close() error { r.closed = true + if cache := r.caches.SegmentPostingsListCache; cache != nil { + // Purge segments from the cache before closing the segment to avoid + // temporarily having postings lists in the cache whose underlying + // bytes are no longer mmap'd. + cache.PurgeSegment(r.uuid) + } + if cache := r.caches.SearchPostingsListCache; cache != nil { + // Purge segments from the cache before closing the segment to avoid + // temporarily having postings lists in the cache whose underlying + // bytes are no longer mmap'd. + cache.PurgeSegment(r.uuid) + } + return r.segment.Close() } diff --git a/src/dbnode/storage/index/read_through_segment_test.go b/src/dbnode/storage/index/read_through_segment_test.go index cac33fb8d4..933de38173 100644 --- a/src/dbnode/storage/index/read_through_segment_test.go +++ b/src/dbnode/storage/index/read_through_segment_test.go @@ -40,6 +40,16 @@ var ( } ) +func testReadThroughSegmentCaches( + segmentPostingsListCache *PostingsListCache, + searchPostingsListCache *PostingsListCache, +) ReadThroughSegmentCaches { + return ReadThroughSegmentCaches{ + SegmentPostingsListCache: segmentPostingsListCache, + SearchPostingsListCache: searchPostingsListCache, + } +} + func TestReadThroughSegmentMatchRegexp(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -59,8 +69,9 @@ func TestReadThroughSegmentMatchRegexp(t *testing.T) { FSTSyntax: parsedRegex, } - readThrough, err := NewReadThroughSegment( - seg, cache, defaultReadThroughSegmentOptions).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(cache, nil), + defaultReadThroughSegmentOptions).Reader() require.NoError(t, err) originalPL := roaring.NewPostingsList() @@ -98,9 +109,12 @@ func TestReadThroughSegmentMatchRegexpCacheDisabled(t *testing.T) { FSTSyntax: parsedRegex, } - readThrough, err := NewReadThroughSegment(seg, cache, ReadThroughSegmentOptions{ - CacheRegexp: false, - }).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(cache, nil), + ReadThroughSegmentOptions{ + CacheRegexp: false, + }). 
+ Reader() require.NoError(t, err) originalPL := roaring.NewPostingsList() @@ -139,8 +153,10 @@ func TestReadThroughSegmentMatchRegexpNoCache(t *testing.T) { FSTSyntax: parsedRegex, } - readThrough, err := NewReadThroughSegment( - seg, nil, defaultReadThroughSegmentOptions).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(nil, nil), + defaultReadThroughSegmentOptions). + Reader() require.NoError(t, err) originalPL := roaring.NewPostingsList() @@ -173,8 +189,10 @@ func TestReadThroughSegmentMatchTerm(t *testing.T) { ) require.NoError(t, originalPL.Insert(1)) - readThrough, err := NewReadThroughSegment( - seg, cache, defaultReadThroughSegmentOptions).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(cache, nil), + defaultReadThroughSegmentOptions). + Reader() require.NoError(t, err) reader.EXPECT().MatchTerm(field, term).Return(originalPL, nil) @@ -211,9 +229,12 @@ func TestReadThroughSegmentMatchTermCacheDisabled(t *testing.T) { ) require.NoError(t, originalPL.Insert(1)) - readThrough, err := NewReadThroughSegment(seg, cache, ReadThroughSegmentOptions{ - CacheTerms: false, - }).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(cache, nil), + ReadThroughSegmentOptions{ + CacheTerms: false, + }). + Reader() require.NoError(t, err) reader.EXPECT(). @@ -250,8 +271,10 @@ func TestReadThroughSegmentMatchTermNoCache(t *testing.T) { seg.EXPECT().Reader().Return(reader, nil) - readThrough, err := NewReadThroughSegment( - seg, nil, defaultReadThroughSegmentOptions).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(nil, nil), + defaultReadThroughSegmentOptions). + Reader() require.NoError(t, err) reader.EXPECT().MatchTerm(field, term).Return(originalPL, nil) @@ -271,10 +294,11 @@ func TestClose(t *testing.T) { require.NoError(t, err) defer stopReporting() - readThroughSeg := NewReadThroughSegment( - segment, cache, defaultReadThroughSegmentOptions) + readThroughSeg := NewReadThroughSegment(segment, + testReadThroughSegmentCaches(nil, nil), + defaultReadThroughSegmentOptions) - segmentUUID := readThroughSeg.(*ReadThroughSegment).uuid + segmentUUID := readThroughSeg.uuid // Store an entry for the segment in the cache so we can check if it // gets purged after. @@ -283,7 +307,7 @@ func TestClose(t *testing.T) { segment.EXPECT().Close().Return(nil) err = readThroughSeg.Close() require.NoError(t, err) - require.True(t, readThroughSeg.(*ReadThroughSegment).closed) + require.True(t, readThroughSeg.closed) // Make sure it does not allow double closes. err = readThroughSeg.Close() @@ -313,8 +337,10 @@ func TestReadThroughSegmentMatchField(t *testing.T) { ) require.NoError(t, originalPL.Insert(1)) - readThrough, err := NewReadThroughSegment( - seg, cache, defaultReadThroughSegmentOptions).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(cache, nil), + defaultReadThroughSegmentOptions). + Reader() require.NoError(t, err) reader.EXPECT().MatchField(field).Return(originalPL, nil) @@ -350,9 +376,12 @@ func TestReadThroughSegmentMatchFieldCacheDisabled(t *testing.T) { ) require.NoError(t, originalPL.Insert(1)) - readThrough, err := NewReadThroughSegment(seg, cache, ReadThroughSegmentOptions{ - CacheTerms: false, - }).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(cache, nil), + ReadThroughSegmentOptions{ + CacheTerms: false, + }). + Reader() require.NoError(t, err) reader.EXPECT(). 
@@ -388,8 +417,10 @@ func TestReadThroughSegmentMatchFieldNoCache(t *testing.T) { seg.EXPECT().Reader().Return(reader, nil) - readThrough, err := NewReadThroughSegment( - seg, nil, defaultReadThroughSegmentOptions).Reader() + readThrough, err := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(nil, nil), + defaultReadThroughSegmentOptions). + Reader() require.NoError(t, err) reader.EXPECT().MatchField(field).Return(originalPL, nil) @@ -406,11 +437,12 @@ func TestCloseNoCache(t *testing.T) { seg := fst.NewMockSegment(ctrl) - readThrough := NewReadThroughSegment( - seg, nil, defaultReadThroughSegmentOptions) + readThrough := NewReadThroughSegment(seg, + testReadThroughSegmentCaches(nil, nil), + defaultReadThroughSegmentOptions) seg.EXPECT().Close().Return(nil) err := readThrough.Close() require.NoError(t, err) - require.True(t, readThrough.(*ReadThroughSegment).closed) + require.True(t, readThrough.closed) } diff --git a/src/dbnode/storage/storage_mock.go b/src/dbnode/storage/storage_mock.go index ee619f0e00..25e64b10ff 100644 --- a/src/dbnode/storage/storage_mock.go +++ b/src/dbnode/storage/storage_mock.go @@ -55,7 +55,6 @@ import ( "github.com/m3db/m3/src/x/instrument" "github.com/m3db/m3/src/x/mmap" "github.com/m3db/m3/src/x/pool" - sync0 "github.com/m3db/m3/src/x/sync" time0 "github.com/m3db/m3/src/x/time" "github.com/golang/mock/gomock" @@ -4622,34 +4621,6 @@ func (mr *MockOptionsMockRecorder) FetchBlocksMetadataResultsPool() *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchBlocksMetadataResultsPool", reflect.TypeOf((*MockOptions)(nil).FetchBlocksMetadataResultsPool)) } -// SetQueryIDsWorkerPool mocks base method -func (m *MockOptions) SetQueryIDsWorkerPool(value sync0.WorkerPool) Options { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SetQueryIDsWorkerPool", value) - ret0, _ := ret[0].(Options) - return ret0 -} - -// SetQueryIDsWorkerPool indicates an expected call of SetQueryIDsWorkerPool -func (mr *MockOptionsMockRecorder) SetQueryIDsWorkerPool(value interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetQueryIDsWorkerPool", reflect.TypeOf((*MockOptions)(nil).SetQueryIDsWorkerPool), value) -} - -// QueryIDsWorkerPool mocks base method -func (m *MockOptions) QueryIDsWorkerPool() sync0.WorkerPool { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "QueryIDsWorkerPool") - ret0, _ := ret[0].(sync0.WorkerPool) - return ret0 -} - -// QueryIDsWorkerPool indicates an expected call of QueryIDsWorkerPool -func (mr *MockOptionsMockRecorder) QueryIDsWorkerPool() *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "QueryIDsWorkerPool", reflect.TypeOf((*MockOptions)(nil).QueryIDsWorkerPool)) -} - // SetWriteBatchPool mocks base method func (m *MockOptions) SetWriteBatchPool(value *writes.WriteBatchPool) Options { m.ctrl.T.Helper() diff --git a/src/m3ninx/index/index_mock.go b/src/m3ninx/index/index_mock.go index 0fac97b6dc..1621c20b74 100644 --- a/src/m3ninx/index/index_mock.go +++ b/src/m3ninx/index/index_mock.go @@ -205,6 +205,21 @@ func (mr *MockReaderMockRecorder) MetadataIterator(arg0 interface{}) *gomock.Cal return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MetadataIterator", reflect.TypeOf((*MockReader)(nil).MetadataIterator), arg0) } +// NumDocs mocks base method +func (m *MockReader) NumDocs() (int, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NumDocs") + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// NumDocs 
indicates an expected call of NumDocs +func (mr *MockReaderMockRecorder) NumDocs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NumDocs", reflect.TypeOf((*MockReader)(nil).NumDocs)) +} + // MockDocRetriever is a mock of DocRetriever interface type MockDocRetriever struct { ctrl *gomock.Controller @@ -243,6 +258,21 @@ func (mr *MockDocRetrieverMockRecorder) Doc(arg0 interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Doc", reflect.TypeOf((*MockDocRetriever)(nil).Doc), arg0) } +// NumDocs mocks base method +func (m *MockDocRetriever) NumDocs() (int, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NumDocs") + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// NumDocs indicates an expected call of NumDocs +func (mr *MockDocRetrieverMockRecorder) NumDocs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NumDocs", reflect.TypeOf((*MockDocRetriever)(nil).NumDocs)) +} + // MockMetadataRetriever is a mock of MetadataRetriever interface type MockMetadataRetriever struct { ctrl *gomock.Controller diff --git a/src/m3ninx/index/segment/segment_mock.go b/src/m3ninx/index/segment/segment_mock.go index 06e523e624..50b96da088 100644 --- a/src/m3ninx/index/segment/segment_mock.go +++ b/src/m3ninx/index/segment/segment_mock.go @@ -32,6 +32,7 @@ import ( "github.com/m3db/m3/src/m3ninx/postings" "github.com/golang/mock/gomock" + "github.com/uber-go/tally" ) // MockSegment is a mock of Segment interface @@ -196,6 +197,21 @@ func (mr *MockReaderMockRecorder) Metadata(id interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Metadata", reflect.TypeOf((*MockReader)(nil).Metadata), id) } +// NumDocs mocks base method +func (m *MockReader) NumDocs() (int, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NumDocs") + ret0, _ := ret[0].(int) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// NumDocs indicates an expected call of NumDocs +func (mr *MockReaderMockRecorder) NumDocs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NumDocs", reflect.TypeOf((*MockReader)(nil).NumDocs)) +} + // Doc mocks base method func (m *MockReader) Doc(id postings.ID) (doc.Document, error) { m.ctrl.T.Helper() @@ -360,6 +376,21 @@ func (mr *MockReaderMockRecorder) Terms(field interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Terms", reflect.TypeOf((*MockReader)(nil).Terms), field) } +// FieldsPostingsList mocks base method +func (m *MockReader) FieldsPostingsList() (FieldsPostingsListIterator, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FieldsPostingsList") + ret0, _ := ret[0].(FieldsPostingsListIterator) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// FieldsPostingsList indicates an expected call of FieldsPostingsList +func (mr *MockReaderMockRecorder) FieldsPostingsList() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FieldsPostingsList", reflect.TypeOf((*MockReader)(nil).FieldsPostingsList)) +} + // ContainsField mocks base method func (m *MockReader) ContainsField(field []byte) (bool, error) { m.ctrl.T.Helper() @@ -1803,6 +1834,18 @@ func (mr *MockSegmentsBuilderMockRecorder) AllDocs() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AllDocs", reflect.TypeOf((*MockSegmentsBuilder)(nil).AllDocs)) } +// SetFilter mocks base method +func (m 
*MockSegmentsBuilder) SetFilter(keep DocumentsFilter, filterCount tally.Counter) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "SetFilter", keep, filterCount) +} + +// SetFilter indicates an expected call of SetFilter +func (mr *MockSegmentsBuilderMockRecorder) SetFilter(keep, filterCount interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetFilter", reflect.TypeOf((*MockSegmentsBuilder)(nil).SetFilter), keep, filterCount) +} + // AddSegments mocks base method func (m *MockSegmentsBuilder) AddSegments(segments []Segment) error { m.ctrl.T.Helper() @@ -1816,3 +1859,55 @@ func (mr *MockSegmentsBuilderMockRecorder) AddSegments(segments interface{}) *go mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddSegments", reflect.TypeOf((*MockSegmentsBuilder)(nil).AddSegments), segments) } + +// SegmentMetadatas mocks base method +func (m *MockSegmentsBuilder) SegmentMetadatas() ([]SegmentsBuilderSegmentMetadata, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SegmentMetadatas") + ret0, _ := ret[0].([]SegmentsBuilderSegmentMetadata) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// SegmentMetadatas indicates an expected call of SegmentMetadatas +func (mr *MockSegmentsBuilderMockRecorder) SegmentMetadatas() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SegmentMetadatas", reflect.TypeOf((*MockSegmentsBuilder)(nil).SegmentMetadatas)) +} + +// MockDocumentsFilter is a mock of DocumentsFilter interface +type MockDocumentsFilter struct { + ctrl *gomock.Controller + recorder *MockDocumentsFilterMockRecorder +} + +// MockDocumentsFilterMockRecorder is the mock recorder for MockDocumentsFilter +type MockDocumentsFilterMockRecorder struct { + mock *MockDocumentsFilter +} + +// NewMockDocumentsFilter creates a new mock instance +func NewMockDocumentsFilter(ctrl *gomock.Controller) *MockDocumentsFilter { + mock := &MockDocumentsFilter{ctrl: ctrl} + mock.recorder = &MockDocumentsFilterMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockDocumentsFilter) EXPECT() *MockDocumentsFilterMockRecorder { + return m.recorder +} + +// Contains mocks base method +func (m *MockDocumentsFilter) Contains(d doc.Metadata) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Contains", d) + ret0, _ := ret[0].(bool) + return ret0 +} + +// Contains indicates an expected call of Contains +func (mr *MockDocumentsFilterMockRecorder) Contains(d interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Contains", reflect.TypeOf((*MockDocumentsFilter)(nil).Contains), d) +} diff --git a/src/m3ninx/search/search_mock.go b/src/m3ninx/search/search_mock.go index 3c327a7bba..71af5c301f 100644 --- a/src/m3ninx/search/search_mock.go +++ b/src/m3ninx/search/search_mock.go @@ -204,3 +204,41 @@ func (mr *MockSearcherMockRecorder) Search(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Search", reflect.TypeOf((*MockSearcher)(nil).Search), arg0) } + +// MockReadThroughSegmentSearcher is a mock of ReadThroughSegmentSearcher interface +type MockReadThroughSegmentSearcher struct { + ctrl *gomock.Controller + recorder *MockReadThroughSegmentSearcherMockRecorder +} + +// MockReadThroughSegmentSearcherMockRecorder is the mock recorder for MockReadThroughSegmentSearcher +type MockReadThroughSegmentSearcherMockRecorder struct 
{ + mock *MockReadThroughSegmentSearcher +} + +// NewMockReadThroughSegmentSearcher creates a new mock instance +func NewMockReadThroughSegmentSearcher(ctrl *gomock.Controller) *MockReadThroughSegmentSearcher { + mock := &MockReadThroughSegmentSearcher{ctrl: ctrl} + mock.recorder = &MockReadThroughSegmentSearcherMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockReadThroughSegmentSearcher) EXPECT() *MockReadThroughSegmentSearcherMockRecorder { + return m.recorder +} + +// Search mocks base method +func (m *MockReadThroughSegmentSearcher) Search(query Query, searcher Searcher) (postings.List, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Search", query, searcher) + ret0, _ := ret[0].(postings.List) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Search indicates an expected call of Search +func (mr *MockReadThroughSegmentSearcherMockRecorder) Search(query, searcher interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Search", reflect.TypeOf((*MockReadThroughSegmentSearcher)(nil).Search), query, searcher) +} From 61e6185f99fc6a5282ca5bb48cc927daa64dc663 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Tue, 13 Apr 2021 11:24:21 -0400 Subject: [PATCH 102/106] Shard postings list cache --- src/dbnode/storage/index/block_test.go | 23 +++- .../storage/index/postings_list_cache.go | 44 +++---- .../storage/index/postings_list_cache_lru.go | 119 ++++++++++++++++-- .../index/postings_list_cache_lru_test.go | 10 ++ .../storage/index/postings_list_cache_test.go | 1 + 5 files changed, 160 insertions(+), 37 deletions(-) diff --git a/src/dbnode/storage/index/block_test.go b/src/dbnode/storage/index/block_test.go index 7fd5eddab4..ea57b415f4 100644 --- a/src/dbnode/storage/index/block_test.go +++ b/src/dbnode/storage/index/block_test.go @@ -387,7 +387,9 @@ func TestBlockQueryWithCancelledQuery(t *testing.T) { cancellable.Cancel() _, err = b.Query(context.NewContext(), cancellable, - defaultQuery, QueryOptions{}, nil, emptyLogFields) + defaultQuery, QueryOptions{}, + NewQueryResults(nil, QueryResultsOptions{}, testOpts), + emptyLogFields) require.Error(t, err) require.Equal(t, errCancelledQuery, err) } @@ -407,7 +409,9 @@ func TestBlockQueryExecutorError(t *testing.T) { } _, err = b.Query(context.NewContext(), xresource.NewCancellableLifetime(), - defaultQuery, QueryOptions{}, nil, emptyLogFields) + defaultQuery, QueryOptions{}, + NewQueryResults(nil, QueryResultsOptions{}, testOpts), + emptyLogFields) require.Error(t, err) } @@ -430,7 +434,9 @@ func TestBlockQuerySegmentReaderError(t *testing.T) { seg.EXPECT().Reader().Return(nil, randErr) _, err = b.Query(context.NewContext(), xresource.NewCancellableLifetime(), - defaultQuery, QueryOptions{}, nil, emptyLogFields) + defaultQuery, QueryOptions{}, + NewQueryResults(nil, QueryResultsOptions{}, testOpts), + emptyLogFields) require.Equal(t, randErr, err) } @@ -470,7 +476,9 @@ func TestBlockQueryAddResultsSegmentsError(t *testing.T) { seg3.EXPECT().Reader().Return(nil, randErr) _, err = b.Query(context.NewContext(), xresource.NewCancellableLifetime(), - defaultQuery, QueryOptions{}, nil, emptyLogFields) + defaultQuery, QueryOptions{}, + NewQueryResults(nil, QueryResultsOptions{}, testOpts), + emptyLogFields) require.Equal(t, randErr, err) } @@ -497,7 +505,9 @@ func TestBlockMockQueryExecutorExecError(t *testing.T) { exec.EXPECT().Close(), ) _, err = b.Query(context.NewContext(), 
xresource.NewCancellableLifetime(),
-		defaultQuery, QueryOptions{}, nil, emptyLogFields)
+		defaultQuery, QueryOptions{},
+		NewQueryResults(nil, QueryResultsOptions{}, testOpts),
+		emptyLogFields)
 	require.Error(t, err)
 }
 
@@ -534,7 +544,8 @@ func TestBlockMockQueryExecutorExecIterErr(t *testing.T) {
 
 	_, err = b.Query(ctx, xresource.NewCancellableLifetime(),
 		defaultQuery, QueryOptions{},
-		NewQueryResults(nil, QueryResultsOptions{}, testOpts), emptyLogFields)
+		NewQueryResults(nil, QueryResultsOptions{}, testOpts),
+		emptyLogFields)
 	require.Error(t, err)
 
 	// NB(r): Make sure to call finalizers blockingly (to finish
diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go
index b90de34146..3bedfa4d4d 100644
--- a/src/dbnode/storage/index/postings_list_cache.go
+++ b/src/dbnode/storage/index/postings_list_cache.go
@@ -22,6 +22,7 @@ package index
 
 import (
 	"errors"
+	"math"
 	"time"
 
 	"github.com/m3db/m3/src/m3ninx/generated/proto/querypb"
@@ -30,7 +31,6 @@
 	"github.com/m3db/m3/src/m3ninx/search"
 	"github.com/m3db/m3/src/x/instrument"
 
-	"github.com/cespare/xxhash/v2"
 	"github.com/pborman/uuid"
 	"github.com/uber-go/tally"
 	"go.uber.org/zap"
@@ -100,7 +100,11 @@ func NewPostingsListCache(
 		return nil, nil, err
 	}
 
-	lru, err := newPostingsListLRU(size)
+	lru, err := newPostingsListLRU(postingsListLRUOptions{
+		size: size,
+		// Use ~1000 items per shard.
+		shards: int(math.Ceil(float64(size) / 1000)),
+	})
 	if err != nil {
 		return nil, nil, err
 	}
@@ -181,21 +185,6 @@ type cachedPostings struct {
 	searchQuery *querypb.Query
 }
 
-func keyHash(
-	segmentUUID uuid.UUID,
-	field string,
-	pattern string,
-	patternType PatternType,
-) uint64 {
-	var h xxhash.Digest
-	h.Reset()
-	_, _ = h.Write(segmentUUID)
-	_, _ = h.WriteString(field)
-	_, _ = h.WriteString(pattern)
-	_, _ = h.WriteString(string(patternType))
-	return h.Sum64()
-}
-
 // PutRegexp updates the LRU with the result of the regexp query.
 func (q *PostingsListCache) PutRegexp(
 	segmentUUID uuid.UUID,
@@ -345,16 +334,29 @@ func (q *PostingsListCache) CachedPatterns(
 ) CachedPatternsResult {
 	var result CachedPatternsResult
 
-	q.lru.RLock()
-	defer q.lru.RUnlock()
+	for _, shard := range q.lru.shards {
+		shard.RLock()
+		result = shardCachedPatternsWithRLock(uuid, query, fn, shard, result)
+		shard.RUnlock()
+	}
+
+	return result
+}
 
-	segmentPostings, ok := q.lru.items[uuid.Array()]
+func shardCachedPatternsWithRLock(
+	uuid uuid.UUID,
+	query CachedPatternsQuery,
+	fn CachedPatternForEachFn,
+	shard *postingsListLRUShard,
+	result CachedPatternsResult,
+) CachedPatternsResult {
+	segmentPostings, ok := shard.items[uuid.Array()]
 	if !ok {
 		return result
 	}
 
 	result.InRegistry = true
-	result.TotalPatterns = len(segmentPostings)
+	result.TotalPatterns += len(segmentPostings)
 	for key, value := range segmentPostings {
 		if v := query.PatternType; v != nil && *v != key.patternType {
 			continue
diff --git a/src/dbnode/storage/index/postings_list_cache_lru.go b/src/dbnode/storage/index/postings_list_cache_lru.go
index faa6e3c9bf..fb3f771036 100644
--- a/src/dbnode/storage/index/postings_list_cache_lru.go
+++ b/src/dbnode/storage/index/postings_list_cache_lru.go
@@ -23,8 +23,10 @@ package index
 import (
 	"container/list"
 	"errors"
+	"math"
 	"sync"
 
+	"github.com/cespare/xxhash/v2"
 	"github.com/pborman/uuid"
 )
 
@@ -58,6 +60,11 @@ import (
 // LRU. The specialization has the additional nice property that we don't need to allocate every time
 // we add an item to the LRU due to the interface{} conversion.
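+//
+// To reduce lock contention when many queries hit the cache concurrently,
+// the LRU is additionally split into shards: each key hashes to a single
+// shard, and every shard owns its own mutex, eviction list and two-level map.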
type postingsListLRU struct {
+	shards    []*postingsListLRUShard
+	numShards uint64
+}
+
+type postingsListLRUShard struct {
 	sync.RWMutex
 	size      int
 	evictList *list.List
 	items     map[uuid.Array]map[key]*list.Element
@@ -77,26 +84,103 @@ type key struct {
 	patternType PatternType
 }
 
+type postingsListLRUOptions struct {
+	size   int
+	shards int
+}
+
 // newPostingsListLRU constructs an LRU of the given size.
-func newPostingsListLRU(size int) (*postingsListLRU, error) {
+func newPostingsListLRU(opts postingsListLRUOptions) (*postingsListLRU, error) {
+	size, shards := opts.size, opts.shards
 	if size <= 0 {
-		return nil, errors.New("Must provide a positive size")
+		return nil, errors.New("must provide a positive size")
+	}
+	if shards <= 0 {
+		return nil, errors.New("must provide a positive shard count")
+	}
+
+	lruShards := make([]*postingsListLRUShard, 0, shards)
+	for i := 0; i < shards; i++ {
+		lruShard := newPostingsListLRUShard(int(math.Ceil(float64(size) / float64(shards))))
+		lruShards = append(lruShards, lruShard)
 	}
 
 	return &postingsListLRU{
+		shards:    lruShards,
+		numShards: uint64(len(lruShards)),
+	}, nil
+}
+
+// newPostingsListLRUShard constructs a single shard of the LRU with the given size.
+func newPostingsListLRUShard(size int) *postingsListLRUShard {
+	return &postingsListLRUShard{
 		size:      size,
 		evictList: list.New(),
 		items:     make(map[uuid.Array]map[key]*list.Element),
-	}, nil
+	}
+}
+
+func (c *postingsListLRU) shard(
+	segmentUUID uuid.UUID,
+	field, pattern string,
+	patternType PatternType,
+) *postingsListLRUShard {
+	idx := hashKey(segmentUUID, field, pattern, patternType) % c.numShards
+	return c.shards[idx]
+}
 
-// Add adds a value to the cache. Returns true if an eviction occurred.
 func (c *postingsListLRU) Add(
 	segmentUUID uuid.UUID,
 	field string,
 	pattern string,
 	patternType PatternType,
 	cachedPostings *cachedPostings,
+) bool {
+	shard := c.shard(segmentUUID, field, pattern, patternType)
+	return shard.Add(segmentUUID, field, pattern, patternType, cachedPostings)
+}
+
+func (c *postingsListLRU) Get(
+	segmentUUID uuid.UUID,
+	field string,
+	pattern string,
+	patternType PatternType,
+) (*cachedPostings, bool) {
+	shard := c.shard(segmentUUID, field, pattern, patternType)
+	return shard.Get(segmentUUID, field, pattern, patternType)
+}
+
+func (c *postingsListLRU) Remove(
+	segmentUUID uuid.UUID,
+	field string,
+	pattern string,
+	patternType PatternType,
+) bool {
+	shard := c.shard(segmentUUID, field, pattern, patternType)
+	return shard.Remove(segmentUUID, field, pattern, patternType)
+}
+
+func (c *postingsListLRU) PurgeSegment(segmentUUID uuid.UUID) {
+	for _, shard := range c.shards {
+		shard.PurgeSegment(segmentUUID)
+	}
+}
+
+func (c *postingsListLRU) Len() int {
+	n := 0
+	for _, shard := range c.shards {
+		n += shard.Len()
+	}
+	return n
+}
+
+// Add adds a value to the cache. Returns true if an eviction occurred.
+func (c *postingsListLRUShard) Add(
+	segmentUUID uuid.UUID,
+	field string,
+	pattern string,
+	patternType PatternType,
+	cachedPostings *cachedPostings,
 ) (evicted bool) {
 	c.Lock()
 	defer c.Unlock()
@@ -142,7 +226,7 @@ func (c *postingsListLRU) Add(
 }
 
 // Get looks up a key's value from the cache.
-func (c *postingsListLRU) Get(
+func (c *postingsListLRUShard) Get(
 	segmentUUID uuid.UUID,
 	field string,
 	pattern string,
@@ -170,7 +254,7 @@ func (c *postingsListLRU) Get(
 
 // Remove removes the provided key from the cache, returning if the
 // key was contained.
-func (c *postingsListLRU) Remove(
+func (c *postingsListLRUShard) Remove(
 	segmentUUID uuid.UUID,
 	field string,
 	pattern string,
@@ -191,7 +275,7 @@ func (c *postingsListLRU) Remove(
 	return false
 }
 
-func (c *postingsListLRU) PurgeSegment(segmentUUID uuid.UUID) {
+func (c *postingsListLRUShard) PurgeSegment(segmentUUID uuid.UUID) {
 	c.Lock()
 	defer c.Unlock()
 
@@ -203,14 +287,14 @@ func (c *postingsListLRU) PurgeSegment(segmentUUID uuid.UUID) {
 }
 
 // Len returns the number of items in the cache.
-func (c *postingsListLRU) Len() int {
+func (c *postingsListLRUShard) Len() int {
 	c.RLock()
 	defer c.RUnlock()
 	return c.evictList.Len()
 }
 
 // removeOldest removes the oldest item from the cache.
-func (c *postingsListLRU) removeOldest() {
+func (c *postingsListLRUShard) removeOldest() {
 	ent := c.evictList.Back()
 	if ent != nil {
 		c.removeElement(ent)
@@ -218,7 +302,7 @@ func (c *postingsListLRU) removeOldest() {
 }
 
 // removeElement is used to remove a given list element from the cache
-func (c *postingsListLRU) removeElement(e *list.Element) {
+func (c *postingsListLRUShard) removeElement(e *list.Element) {
 	c.evictList.Remove(e)
 	entry := e.Value.(*entry)
 
@@ -233,3 +317,18 @@ func (c *postingsListLRU) removeElement(e *list.Element) {
 func newKey(field, pattern string, patternType PatternType) key {
 	return key{field: field, pattern: pattern, patternType: patternType}
 }
+
+func hashKey(
+	segmentUUID uuid.UUID,
+	field string,
+	pattern string,
+	patternType PatternType,
+) uint64 {
+	var h xxhash.Digest
+	h.Reset()
+	_, _ = h.Write(segmentUUID)
+	_, _ = h.WriteString(field)
+	_, _ = h.WriteString(pattern)
+	_, _ = h.WriteString(string(patternType))
+	return h.Sum64()
+}
diff --git a/src/dbnode/storage/index/postings_list_cache_lru_test.go b/src/dbnode/storage/index/postings_list_cache_lru_test.go
index 2fb384e767..94c4d88eff 100644
--- a/src/dbnode/storage/index/postings_list_cache_lru_test.go
+++ b/src/dbnode/storage/index/postings_list_cache_lru_test.go
@@ -23,6 +23,16 @@ package index
 // Keys returns a slice of the keys in the cache, from oldest to newest. Used for
 // testing only.
 func (c *postingsListLRU) keys() []key {
+	var keys []key
+	for _, shard := range c.shards {
+		keys = append(keys, shard.keys()...)
+	}
+	return keys
+}
+
+// keys returns a slice of the keys in this shard, from oldest to newest. Used for
+// testing only.
+func (c *postingsListLRUShard) keys() []key {
 	keys := make([]key, 0, len(c.items))
 	for ent := c.evictList.Back(); ent != nil; ent = ent.Prev() {
 		entry := ent.Value.(*entry)
diff --git a/src/dbnode/storage/index/postings_list_cache_test.go b/src/dbnode/storage/index/postings_list_cache_test.go
index e4edd01a3f..8bfd2f2943 100644
--- a/src/dbnode/storage/index/postings_list_cache_test.go
+++ b/src/dbnode/storage/index/postings_list_cache_test.go
@@ -43,6 +43,7 @@ var (
 	// Filled in by init().
testPlEntries []testEntry testPostingListCacheOptions = PostingsListCacheOptions{ + PostingsListPool: postings.NewPool(nil, roaring.NewPostingsList), InstrumentOptions: instrument.NewOptions(), } ) From 5e035702633e82ab10e867619f151a4a89043d70 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Thu, 15 Apr 2021 15:31:08 -0400 Subject: [PATCH 103/106] Fix reconstructed searches not being executed --- src/dbnode/storage/index/block_test.go | 34 --- src/dbnode/storage/index/mutable_segments.go | 6 +- .../storage/index/mutable_segments_test.go | 213 ++++++++++++++++-- src/dbnode/storage/index/options.go | 4 +- .../storage/index/postings_list_cache.go | 22 +- .../storage/index/postings_list_cache_lru.go | 25 +- .../index/postings_list_cache_lru_test.go | 8 +- .../storage/index/postings_list_cache_test.go | 68 +++--- .../storage/index/read_through_segment.go | 6 + 9 files changed, 267 insertions(+), 119 deletions(-) diff --git a/src/dbnode/storage/index/block_test.go b/src/dbnode/storage/index/block_test.go index ea57b415f4..db8e7b816e 100644 --- a/src/dbnode/storage/index/block_test.go +++ b/src/dbnode/storage/index/block_test.go @@ -2252,37 +2252,3 @@ func testDoc3() doc.Metadata { }, } } - -func testDocN(n int) doc.Metadata { - return doc.Metadata{ - ID: []byte(fmt.Sprintf("doc-%d", n)), - Fields: []doc.Field{ - { - Name: []byte("foo"), - Value: []byte("bar"), - }, - { - Name: []byte("bucket-0"), - Value: moduloByteStr([]string{ - "one", - "two", - "three", - }, n), - }, - { - Name: []byte("bucket-1"), - Value: moduloByteStr([]string{ - "one", - "two", - "three", - "four", - "five", - }, n), - }, - }, - } -} - -func moduloByteStr(strs []string, n int) []byte { - return []byte(strs[n%len(strs)]) -} diff --git a/src/dbnode/storage/index/mutable_segments.go b/src/dbnode/storage/index/mutable_segments.go index 45abb7cba9..fe4bf35678 100644 --- a/src/dbnode/storage/index/mutable_segments.go +++ b/src/dbnode/storage/index/mutable_segments.go @@ -913,10 +913,10 @@ func (m *mutableSegments) populateCachedSearches( }) } - searches := make(map[string]cachedPatternForCompactedSegment) + searches := make(map[PostingsListCacheKey]cachedPatternForCompactedSegment) for i, seg := range prevSegs { result := seg.segment.CachedSearchPatterns(func(p CachedPattern) { - pattern, ok := searches[p.SearchQueryKey] + pattern, ok := searches[p.CacheKey] if !ok { pattern = cachedPatternForCompactedSegment{ searchQuery: p.SearchQuery, @@ -927,7 +927,7 @@ func (m *mutableSegments) populateCachedSearches( prevSeg: prevSeg, } } - searches[p.SearchQueryKey] = pattern + searches[p.CacheKey] = pattern } // Mark this segment with the cached pattern. 
pattern.patterns[i].hasCachedPattern = true diff --git a/src/dbnode/storage/index/mutable_segments_test.go b/src/dbnode/storage/index/mutable_segments_test.go index 6505087e05..0b9686b264 100644 --- a/src/dbnode/storage/index/mutable_segments_test.go +++ b/src/dbnode/storage/index/mutable_segments_test.go @@ -21,33 +21,79 @@ package index import ( + "fmt" "testing" "time" + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/m3ninx/doc" + "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding/docs" + "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" + "github.com/m3db/m3/src/m3ninx/search" + "github.com/m3db/m3/src/m3ninx/search/query" + "github.com/m3db/m3/src/x/instrument" + "github.com/m3db/m3/src/x/pool" xsync "github.com/m3db/m3/src/x/sync" xtest "github.com/m3db/m3/src/x/test" - - "github.com/stretchr/testify/require" + xtime "github.com/m3db/m3/src/x/time" ) +type testMutableSegmentsResult struct { + logger *zap.Logger + cache *PostingsListCache + searchCache *PostingsListCache +} + func newTestMutableSegments( t *testing.T, md namespace.Metadata, blockStart time.Time, -) *mutableSegments { +) (*mutableSegments, testMutableSegmentsResult) { cachedSearchesWorkers := xsync.NewWorkerPool(2) cachedSearchesWorkers.Init() - segs, err := newMutableSegments(md, blockStart, testOpts, BlockOptions{}, - cachedSearchesWorkers, namespace.NewRuntimeOptionsManager("foo"), - testOpts.InstrumentOptions()) + iOpts := instrument.NewTestOptions(t) + + poolOpts := pool.NewObjectPoolOptions().SetSize(0) + pool := postings.NewPool(poolOpts, roaring.NewPostingsList) + + cache, _, err := NewPostingsListCache(10, PostingsListCacheOptions{ + PostingsListPool: pool, + InstrumentOptions: iOpts, + }) + require.NoError(t, err) + + searchCache, _, err := NewPostingsListCache(10, PostingsListCacheOptions{ + PostingsListPool: pool, + InstrumentOptions: iOpts, + }) + require.NoError(t, err) + + opts := testOpts. + SetPostingsListCache(cache). + SetSearchPostingsListCache(searchCache). + SetReadThroughSegmentOptions(ReadThroughSegmentOptions{ + CacheRegexp: true, + CacheTerms: true, + CacheSearches: true, + }) + + segs, err := newMutableSegments(md, blockStart, opts, BlockOptions{}, + cachedSearchesWorkers, namespace.NewRuntimeOptionsManager("foo"), iOpts) require.NoError(t, err) - return segs + return segs, testMutableSegmentsResult{ + logger: iOpts.Logger(), + searchCache: searchCache, + } } -func TestMutableSegmentsBackgroundCompact(t *testing.T) { +func TestMutableSegmentsBackgroundCompactGCReconstructCachedSearches(t *testing.T) { ctrl := xtest.NewController(t) defer ctrl.Finish() @@ -57,19 +103,152 @@ func TestMutableSegmentsBackgroundCompact(t *testing.T) { nowNotBlockStartAligned := blockStart.Add(time.Minute) - segs := newTestMutableSegments(t, testMD, blockStart) + segs, result := newTestMutableSegments(t, testMD, blockStart) segs.backgroundCompactDisable = true // Disable to explicitly test. - batch := NewWriteBatch(WriteBatchOptions{ - IndexBlockSize: blockSize, - }) + logger := result.logger.With(zap.String("test", t.Name())) - for i := 0; i < 32; i++ { - batch.Append(WriteBatchEntry{ - Timestamp: nowNotBlockStartAligned, - }, testDocN(i)) + // Insert until we have a background segment. 
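+	// (Writes land in foreground segments first; foreground compaction then
+	// moves the compacted results into the background set, so keep writing
+	// batches until a background segment appears.)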
+ inserted := 0 + for { + segs.Lock() + segsBackground := len(segs.backgroundSegments) + segs.Unlock() + if segsBackground > 0 { + break + } + + batch := NewWriteBatch(WriteBatchOptions{ + IndexBlockSize: blockSize, + }) + for i := 0; i < 128; i++ { + stillIndexedBlockStartsAtGC := 1 + if inserted%2 == 0 { + stillIndexedBlockStartsAtGC = 0 + } + onIndexSeries := NewMockOnIndexSeries(ctrl) + onIndexSeries.EXPECT(). + RelookupAndIncrementReaderWriterCount(). + Return(onIndexSeries, true). + AnyTimes() + onIndexSeries.EXPECT(). + RemoveIndexedForBlockStarts(gomock.Any()). + Return(RemoveIndexedForBlockStartsResult{ + IndexedBlockStartsRemaining: stillIndexedBlockStartsAtGC, + }). + AnyTimes() + onIndexSeries.EXPECT(). + DecrementReaderWriterCount(). + AnyTimes() + + batch.Append(WriteBatchEntry{ + Timestamp: nowNotBlockStartAligned, + OnIndexSeries: onIndexSeries, + }, testDocN(inserted)) + inserted++ + } + + _, err := segs.WriteBatch(batch) + require.NoError(t, err) } - _, err := segs.WriteBatch(batch) + // Perform some searches. + readers, err := segs.AddReaders(nil) + require.NoError(t, err) + + b0, err := query.NewRegexpQuery([]byte("bucket-0"), []byte("(one|three)")) require.NoError(t, err) + + b1, err := query.NewRegexpQuery([]byte("bucket-0"), []byte("(one|three|five)")) + require.NoError(t, err) + + q := query.NewConjunctionQuery([]search.Query{b0, b1}) + searcher, err := q.Searcher() + require.NoError(t, err) + + results := make(map[string]struct{}) + for _, reader := range readers { + readThrough, ok := reader.(search.ReadThroughSegmentSearcher) + require.True(t, ok) + + pl, err := readThrough.Search(q, searcher) + require.NoError(t, err) + + it, err := reader.Docs(pl) + require.NoError(t, err) + + for it.Next() { + d := it.Current() + id, err := docs.ReadIDFromDocument(d) + require.NoError(t, err) + results[string(id)] = struct{}{} + } + + require.NoError(t, it.Err()) + require.NoError(t, it.Close()) + } + + logger.Info("search results", zap.Int("results", len(results))) + + // Make sure search postings cache was populated. + require.Equal(t, len(readers), result.searchCache.lru.Len()) + + // Explicitly background compact and make sure that background segment + // is GC'd of series no longer present. + segs.Lock() + segs.sealedBlockStarts[xtime.ToUnixNano(blockStart)] = struct{}{} + segs.backgroundCompactGCPending = true + segs.backgroundCompactWithLock() + compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect + segs.Unlock() + + // Should have kicked off a background compact GC. + require.True(t, compactingBackgroundGarbageCollect) + + // Wait for background compact GC to run. 
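+	// (Polls the compactingBackgroundGarbageCollect flag, which clears once
+	// the background compact GC pass has finished.)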
+ for { + segs.Lock() + compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect + segs.Unlock() + if !compactingBackgroundGarbageCollect { + break + } + time.Sleep(100 * time.Millisecond) + } + + // TODO: verify +} + +func testDocN(n int) doc.Metadata { + return doc.Metadata{ + ID: []byte(fmt.Sprintf("doc-%d", n)), + Fields: []doc.Field{ + { + Name: []byte("foo"), + Value: []byte("bar"), + }, + { + Name: []byte("bucket-0"), + Value: moduloByteStr([]string{ + "one", + "two", + "three", + }, n), + }, + { + Name: []byte("bucket-1"), + Value: moduloByteStr([]string{ + "one", + "two", + "three", + "four", + "five", + }, n), + }, + }, + } +} + +func moduloByteStr(strs []string, n int) []byte { + return []byte(strs[n%len(strs)]) } diff --git a/src/dbnode/storage/index/options.go b/src/dbnode/storage/index/options.go index bec989e628..74ba861b22 100644 --- a/src/dbnode/storage/index/options.go +++ b/src/dbnode/storage/index/options.go @@ -427,7 +427,7 @@ func (o *opts) SetPostingsListCache(value *PostingsListCache) Options { } func (o *opts) PostingsListCache() *PostingsListCache { - return o.searchPostingsListCache + return o.postingsListCache } func (o *opts) SetSearchPostingsListCache(value *PostingsListCache) Options { @@ -437,7 +437,7 @@ func (o *opts) SetSearchPostingsListCache(value *PostingsListCache) Options { } func (o *opts) SearchPostingsListCache() *PostingsListCache { - return o.postingsListCache + return o.searchPostingsListCache } func (o *opts) SetReadThroughSegmentOptions(value ReadThroughSegmentOptions) Options { diff --git a/src/dbnode/storage/index/postings_list_cache.go b/src/dbnode/storage/index/postings_list_cache.go index 3bedfa4d4d..841dc89b9c 100644 --- a/src/dbnode/storage/index/postings_list_cache.go +++ b/src/dbnode/storage/index/postings_list_cache.go @@ -179,8 +179,6 @@ type cachedPostings struct { // value postings postings.List - // searchQueryKey is only set for search queries. - searchQueryKey string // searchQuery is only set for search queries. 
searchQuery *querypb.Query } @@ -307,12 +305,9 @@ func (q *PostingsListCache) startLoop() Closer { } type CachedPattern struct { - Field string - Pattern string - PatternType PatternType - SearchQueryKey string - SearchQuery *querypb.Query - Postings postings.List + CacheKey PostingsListCacheKey + SearchQuery *querypb.Query + Postings postings.List } type CachedPatternsResult struct { @@ -358,17 +353,14 @@ func shardCachedPatternsWithRLock( result.InRegistry = true result.TotalPatterns += len(segmentPostings) for key, value := range segmentPostings { - if v := query.PatternType; v != nil && *v != key.patternType { + if v := query.PatternType; v != nil && *v != key.PatternType { continue } fn(CachedPattern{ - Field: key.field, - Pattern: key.pattern, - PatternType: key.patternType, - SearchQueryKey: value.Value.(*entry).cachedPostings.searchQueryKey, - SearchQuery: value.Value.(*entry).cachedPostings.searchQuery, - Postings: value.Value.(*entry).cachedPostings.postings, + CacheKey: key, + SearchQuery: value.Value.(*entry).cachedPostings.searchQuery, + Postings: value.Value.(*entry).cachedPostings.postings, }) result.MatchedPatterns++ } diff --git a/src/dbnode/storage/index/postings_list_cache_lru.go b/src/dbnode/storage/index/postings_list_cache_lru.go index fb3f771036..488ec45879 100644 --- a/src/dbnode/storage/index/postings_list_cache_lru.go +++ b/src/dbnode/storage/index/postings_list_cache_lru.go @@ -68,20 +68,21 @@ type postingsListLRUShard struct { sync.RWMutex size int evictList *list.List - items map[uuid.Array]map[key]*list.Element + items map[uuid.Array]map[PostingsListCacheKey]*list.Element } // entry is used to hold a value in the evictList. type entry struct { uuid uuid.UUID - key key + key PostingsListCacheKey cachedPostings *cachedPostings } -type key struct { - field string - pattern string - patternType PatternType +// PostingsListCacheKey is a postings list cache key. +type PostingsListCacheKey struct { + Field string + Pattern string + PatternType PatternType } type postingsListLRUOptions struct { @@ -116,7 +117,7 @@ func newPostingsListLRUShard(size int) *postingsListLRUShard { return &postingsListLRUShard{ size: size, evictList: list.New(), - items: make(map[uuid.Array]map[key]*list.Element), + items: make(map[uuid.Array]map[PostingsListCacheKey]*list.Element), } } @@ -212,7 +213,7 @@ func (c *postingsListLRUShard) Add( if queries, ok := c.items[uuidArray]; ok { queries[newKey] = entry } else { - c.items[uuidArray] = map[key]*list.Element{ + c.items[uuidArray] = map[PostingsListCacheKey]*list.Element{ newKey: entry, } } @@ -314,8 +315,12 @@ func (c *postingsListLRUShard) removeElement(e *list.Element) { } } -func newKey(field, pattern string, patternType PatternType) key { - return key{field: field, pattern: pattern, patternType: patternType} +func newKey(field, pattern string, patternType PatternType) PostingsListCacheKey { + return PostingsListCacheKey{ + Field: field, + Pattern: pattern, + PatternType: patternType, + } } func hashKey( diff --git a/src/dbnode/storage/index/postings_list_cache_lru_test.go b/src/dbnode/storage/index/postings_list_cache_lru_test.go index 94c4d88eff..29d92f50ac 100644 --- a/src/dbnode/storage/index/postings_list_cache_lru_test.go +++ b/src/dbnode/storage/index/postings_list_cache_lru_test.go @@ -22,8 +22,8 @@ package index // Keys returns a slice of the keys in the cache, from oldest to newest. Used for // testing only. 
-func (c *postingsListLRU) keys() []key { - var keys []key +func (c *postingsListLRU) keys() []PostingsListCacheKey { + var keys []PostingsListCacheKey for _, shard := range c.shards { keys = append(keys, shard.keys()...) } @@ -32,8 +32,8 @@ func (c *postingsListLRU) keys() []key { // Keys returns a slice of the keys in the cache, from oldest to newest. Used for // testing only. -func (c *postingsListLRUShard) keys() []key { - keys := make([]key, 0, len(c.items)) +func (c *postingsListLRUShard) keys() []PostingsListCacheKey { + keys := make([]PostingsListCacheKey, 0, len(c.items)) for ent := c.evictList.Back(); ent != nil; ent = ent.Prev() { entry := ent.Value.(*entry) keys = append(keys, entry.key) diff --git a/src/dbnode/storage/index/postings_list_cache_test.go b/src/dbnode/storage/index/postings_list_cache_test.go index 8bfd2f2943..67463cf29f 100644 --- a/src/dbnode/storage/index/postings_list_cache_test.go +++ b/src/dbnode/storage/index/postings_list_cache_test.go @@ -80,7 +80,7 @@ func init() { type testEntry struct { segmentUUID uuid.UUID - key key + key PostingsListCacheKey postingsList postings.List } @@ -138,18 +138,18 @@ func TestPurgeSegment(t *testing.T) { // Write many entries with the same segment UUID. for i := 0; i < 100; i++ { - if testPlEntries[i].key.patternType == PatternTypeRegexp { + if testPlEntries[i].key.PatternType == PatternTypeRegexp { plCache.PutRegexp( testPlEntries[0].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, + testPlEntries[i].key.Field, + testPlEntries[i].key.Pattern, testPlEntries[i].postingsList, ) } else { plCache.PutTerm( testPlEntries[0].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, + testPlEntries[i].key.Field, + testPlEntries[i].key.Pattern, testPlEntries[i].postingsList, ) } @@ -166,18 +166,18 @@ func TestPurgeSegment(t *testing.T) { // All entries related to the purged segment should be gone. require.Equal(t, size-100, plCache.lru.Len()) for i := 0; i < 100; i++ { - if testPlEntries[i].key.patternType == PatternTypeRegexp { + if testPlEntries[i].key.PatternType == PatternTypeRegexp { _, ok := plCache.GetRegexp( testPlEntries[0].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, + testPlEntries[i].key.Field, + testPlEntries[i].key.Pattern, ) require.False(t, ok) } else { _, ok := plCache.GetTerm( testPlEntries[0].segmentUUID, - testPlEntries[i].key.field, - testPlEntries[i].key.pattern, + testPlEntries[i].key.Field, + testPlEntries[i].key.Pattern, ) require.False(t, ok) } @@ -277,70 +277,70 @@ func testConcurrency(t *testing.T, size int, purge bool, verify bool) { func putEntry(t *testing.T, cache *PostingsListCache, i int) { // Do each put twice to test the logic that avoids storing // multiple entries for the same value. 
-	switch testPlEntries[i].key.patternType {
+	switch testPlEntries[i].key.PatternType {
 	case PatternTypeRegexp:
 		cache.PutRegexp(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
-			testPlEntries[i].key.pattern,
+			testPlEntries[i].key.Field,
+			testPlEntries[i].key.Pattern,
 			testPlEntries[i].postingsList,
 		)
 		cache.PutRegexp(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
-			testPlEntries[i].key.pattern,
+			testPlEntries[i].key.Field,
+			testPlEntries[i].key.Pattern,
 			testPlEntries[i].postingsList,
 		)
 	case PatternTypeTerm:
 		cache.PutTerm(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
-			testPlEntries[i].key.pattern,
+			testPlEntries[i].key.Field,
+			testPlEntries[i].key.Pattern,
 			testPlEntries[i].postingsList,
 		)
 		cache.PutTerm(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
-			testPlEntries[i].key.pattern,
+			testPlEntries[i].key.Field,
+			testPlEntries[i].key.Pattern,
 			testPlEntries[i].postingsList,
 		)
 	case PatternTypeField:
 		cache.PutField(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
+			testPlEntries[i].key.Field,
 			testPlEntries[i].postingsList,
 		)
 		cache.PutField(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
+			testPlEntries[i].key.Field,
 			testPlEntries[i].postingsList,
 		)
 	default:
-		require.FailNow(t, "unknown pattern type", testPlEntries[i].key.patternType)
+		require.FailNow(t, "unknown pattern type", testPlEntries[i].key.PatternType)
 	}
 }
 
 func getEntry(t *testing.T, cache *PostingsListCache, i int) (postings.List, bool) {
-	switch testPlEntries[i].key.patternType {
+	switch testPlEntries[i].key.PatternType {
 	case PatternTypeRegexp:
 		return cache.GetRegexp(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
-			testPlEntries[i].key.pattern,
+			testPlEntries[i].key.Field,
+			testPlEntries[i].key.Pattern,
 		)
 	case PatternTypeTerm:
 		return cache.GetTerm(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
-			testPlEntries[i].key.pattern,
+			testPlEntries[i].key.Field,
+			testPlEntries[i].key.Pattern,
 		)
 	case PatternTypeField:
 		return cache.GetField(
 			testPlEntries[i].segmentUUID,
-			testPlEntries[i].key.field,
+			testPlEntries[i].key.Field,
 		)
 	default:
-		require.FailNow(t, "unknown pattern type", testPlEntries[i].key.patternType)
+		require.FailNow(t, "unknown pattern type", testPlEntries[i].key.PatternType)
 	}
 	return nil, false
 }
@@ -354,14 +354,14 @@ func requireExpectedOrder(t *testing.T, plCache *PostingsListCache, expectedOrde
 func printSortedKeys(t *testing.T, cache *PostingsListCache) {
 	keys := cache.lru.keys()
 	sort.Slice(keys, func(i, j int) bool {
-		iIdx, err := strconv.ParseInt(keys[i].field, 10, 64)
+		iIdx, err := strconv.ParseInt(keys[i].Field, 10, 64)
 		if err != nil {
-			t.Fatalf("unable to parse: %s into int", keys[i].field)
+			t.Fatalf("unable to parse: %s into int", keys[i].Field)
 		}
 
-		jIdx, err := strconv.ParseInt(keys[j].field, 10, 64)
+		jIdx, err := strconv.ParseInt(keys[j].Field, 10, 64)
 		if err != nil {
-			t.Fatalf("unable to parse: %s into int", keys[i].field)
+			t.Fatalf("unable to parse: %s into int", keys[j].Field)
 		}
 
 		return iIdx < jIdx
diff --git a/src/dbnode/storage/index/read_through_segment.go b/src/dbnode/storage/index/read_through_segment.go
index e8b81bbbd9..0ae5404590 100644
--- a/src/dbnode/storage/index/read_through_segment.go
+++ b/src/dbnode/storage/index/read_through_segment.go
@@ -179,6 +179,12 @@ func (r *ReadThroughSegment) PutCachedSearchPattern(
 	query search.Query,
 	pl postings.List,
 ) {
+	r.RLock()
+	defer r.RUnlock()
+	if r.closed {
+		return
+	}
+
 	cache := r.caches.SearchPostingsListCache
 	if cache == nil
|| !r.opts.CacheSearches { return From 734d67faefa9a6bfc124c4d76ee5c2d8c50be21f Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 16 Apr 2021 08:19:35 -0400 Subject: [PATCH 104/106] Do not pool multi-bitmap iterators --- .../storage/index/mutable_segments_test.go | 279 ++++++++++-------- .../postings/roaring/bitmap_multi_readonly.go | 7 +- 2 files changed, 169 insertions(+), 117 deletions(-) diff --git a/src/dbnode/storage/index/mutable_segments_test.go b/src/dbnode/storage/index/mutable_segments_test.go index 0b9686b264..376245644a 100644 --- a/src/dbnode/storage/index/mutable_segments_test.go +++ b/src/dbnode/storage/index/mutable_segments_test.go @@ -1,3 +1,5 @@ +// +build big + // Copyright (c) 2021 Uber Technologies, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy @@ -26,12 +28,13 @@ import ( "time" "github.com/golang/mock/gomock" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" "github.com/m3db/m3/src/dbnode/namespace" "github.com/m3db/m3/src/m3ninx/doc" - "github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding/docs" + "github.com/m3db/m3/src/m3ninx/index" "github.com/m3db/m3/src/m3ninx/postings" "github.com/m3db/m3/src/m3ninx/postings/roaring" "github.com/m3db/m3/src/m3ninx/search" @@ -94,6 +97,11 @@ func newTestMutableSegments( } func TestMutableSegmentsBackgroundCompactGCReconstructCachedSearches(t *testing.T) { + // Use read only postings. + prevReadOnlyPostings := index.MigrationReadOnlyPostings() + index.SetMigrationReadOnlyPostings(true) + defer index.SetMigrationReadOnlyPostings(prevReadOnlyPostings) + ctrl := xtest.NewController(t) defer ctrl.Finish() @@ -106,119 +114,168 @@ func TestMutableSegmentsBackgroundCompactGCReconstructCachedSearches(t *testing. segs, result := newTestMutableSegments(t, testMD, blockStart) segs.backgroundCompactDisable = true // Disable to explicitly test. - logger := result.logger.With(zap.String("test", t.Name())) - - // Insert until we have a background segment. inserted := 0 - for { - segs.Lock() - segsBackground := len(segs.backgroundSegments) - segs.Unlock() - if segsBackground > 0 { - break - } + segs.Lock() + segsBackground := len(segs.backgroundSegments) + segs.Unlock() - batch := NewWriteBatch(WriteBatchOptions{ - IndexBlockSize: blockSize, - }) - for i := 0; i < 128; i++ { - stillIndexedBlockStartsAtGC := 1 - if inserted%2 == 0 { - stillIndexedBlockStartsAtGC = 0 + for runs := 0; runs < 10; runs++ { + t.Run(fmt.Sprintf("run-%d", runs), func(t *testing.T) { + logger := result.logger.With(zap.Int("run", runs)) + + // Insert until we have a new background segment. + for { + segs.Lock() + curr := len(segs.backgroundSegments) + segs.Unlock() + if curr > segsBackground { + segsBackground = curr + break + } + + batch := NewWriteBatch(WriteBatchOptions{ + IndexBlockSize: blockSize, + }) + for i := 0; i < 128; i++ { + stillIndexedBlockStartsAtGC := 1 + if inserted%2 == 0 { + stillIndexedBlockStartsAtGC = 0 + } + onIndexSeries := NewMockOnIndexSeries(ctrl) + onIndexSeries.EXPECT(). + RelookupAndIncrementReaderWriterCount(). + Return(onIndexSeries, true). + AnyTimes() + onIndexSeries.EXPECT(). + RemoveIndexedForBlockStarts(gomock.Any()). + Return(RemoveIndexedForBlockStartsResult{ + IndexedBlockStartsRemaining: stillIndexedBlockStartsAtGC, + }). + AnyTimes() + onIndexSeries.EXPECT(). + DecrementReaderWriterCount(). 
+ AnyTimes() + + batch.Append(WriteBatchEntry{ + Timestamp: nowNotBlockStartAligned, + OnIndexSeries: onIndexSeries, + }, testDocN(inserted)) + inserted++ + } + + _, err := segs.WriteBatch(batch) + require.NoError(t, err) } - onIndexSeries := NewMockOnIndexSeries(ctrl) - onIndexSeries.EXPECT(). - RelookupAndIncrementReaderWriterCount(). - Return(onIndexSeries, true). - AnyTimes() - onIndexSeries.EXPECT(). - RemoveIndexedForBlockStarts(gomock.Any()). - Return(RemoveIndexedForBlockStartsResult{ - IndexedBlockStartsRemaining: stillIndexedBlockStartsAtGC, - }). - AnyTimes() - onIndexSeries.EXPECT(). - DecrementReaderWriterCount(). - AnyTimes() - - batch.Append(WriteBatchEntry{ - Timestamp: nowNotBlockStartAligned, - OnIndexSeries: onIndexSeries, - }, testDocN(inserted)) - inserted++ - } - - _, err := segs.WriteBatch(batch) - require.NoError(t, err) - } - - // Perform some searches. - readers, err := segs.AddReaders(nil) - require.NoError(t, err) - - b0, err := query.NewRegexpQuery([]byte("bucket-0"), []byte("(one|three)")) - require.NoError(t, err) - - b1, err := query.NewRegexpQuery([]byte("bucket-0"), []byte("(one|three|five)")) - require.NoError(t, err) - - q := query.NewConjunctionQuery([]search.Query{b0, b1}) - searcher, err := q.Searcher() - require.NoError(t, err) - - results := make(map[string]struct{}) - for _, reader := range readers { - readThrough, ok := reader.(search.ReadThroughSegmentSearcher) - require.True(t, ok) - pl, err := readThrough.Search(q, searcher) - require.NoError(t, err) - - it, err := reader.Docs(pl) - require.NoError(t, err) + // Perform some searches. + testDocSearches(t, segs) + + // Make sure search postings cache was populated. + require.True(t, result.searchCache.lru.Len() > 0) + logger.Info("search cache populated", zap.Int("n", result.searchCache.lru.Len())) + + // Start some async searches so we have searches going on while + // executing background compact GC. + doneCh := make(chan struct{}, 2) + defer close(doneCh) + for i := 0; i < 2; i++ { + go func() { + for { + select { + case <-doneCh: + return + default: + } + // Search continously. + testDocSearches(t, segs) + } + }() + } - for it.Next() { - d := it.Current() - id, err := docs.ReadIDFromDocument(d) - require.NoError(t, err) - results[string(id)] = struct{}{} - } + // Explicitly background compact and make sure that background segment + // is GC'd of series no longer present. + segs.Lock() + segs.sealedBlockStarts[xtime.ToUnixNano(blockStart)] = struct{}{} + segs.backgroundCompactGCPending = true + segs.backgroundCompactWithLock() + compactingBackgroundStandard := segs.compact.compactingBackgroundStandard + compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect + segs.Unlock() + + // Should have kicked off a background compact GC. + require.True(t, compactingBackgroundStandard || compactingBackgroundGarbageCollect) + + // Wait for background compact GC to run. + for { + segs.Lock() + compactingBackgroundStandard := segs.compact.compactingBackgroundStandard + compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect + segs.Unlock() + if !compactingBackgroundStandard && !compactingBackgroundGarbageCollect { + break + } + time.Sleep(100 * time.Millisecond) + } - require.NoError(t, it.Err()) - require.NoError(t, it.Close()) + logger.Info("compaction done, search cache", zap.Int("n", result.searchCache.lru.Len())) + }) } +} - logger.Info("search results", zap.Int("results", len(results))) - - // Make sure search postings cache was populated. 
- require.Equal(t, len(readers), result.searchCache.lru.Len()) - - // Explicitly background compact and make sure that background segment - // is GC'd of series no longer present. - segs.Lock() - segs.sealedBlockStarts[xtime.ToUnixNano(blockStart)] = struct{}{} - segs.backgroundCompactGCPending = true - segs.backgroundCompactWithLock() - compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect - segs.Unlock() - - // Should have kicked off a background compact GC. - require.True(t, compactingBackgroundGarbageCollect) - - // Wait for background compact GC to run. - for { - segs.Lock() - compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect - segs.Unlock() - if !compactingBackgroundGarbageCollect { - break +func testDocSearches( + t *testing.T, + segs *mutableSegments, +) { + for i := 0; i < len(testDocBucket0Values); i++ { + for j := 0; j < len(testDocBucket1Values); j++ { + readers, err := segs.AddReaders(nil) + assert.NoError(t, err) + + regexp0 := fmt.Sprintf("(%s|%s)", moduloByteStr(testDocBucket0Values, i), + moduloByteStr(testDocBucket0Values, i+1)) + b0, err := query.NewRegexpQuery([]byte(testDocBucket0Name), []byte(regexp0)) + assert.NoError(t, err) + + regexp1 := fmt.Sprintf("(%s|%s|%s)", moduloByteStr(testDocBucket1Values, j), + moduloByteStr(testDocBucket1Values, j+1), + moduloByteStr(testDocBucket1Values, j+2)) + b1, err := query.NewRegexpQuery([]byte(testDocBucket1Name), []byte(regexp1)) + assert.NoError(t, err) + + q := query.NewConjunctionQuery([]search.Query{b0, b1}) + searcher, err := q.Searcher() + assert.NoError(t, err) + + for _, reader := range readers { + readThrough, ok := reader.(search.ReadThroughSegmentSearcher) + assert.True(t, ok) + + pl, err := readThrough.Search(q, searcher) + assert.NoError(t, err) + + assert.True(t, pl.CountSlow() > 0) + } } - time.Sleep(100 * time.Millisecond) } - - // TODO: verify } +var ( + testDocBucket0Name = "bucket_0" + testDocBucket0Values = []string{ + "one", + "two", + "three", + } + testDocBucket1Name = "bucket_1" + testDocBucket1Values = []string{ + "one", + "two", + "three", + "four", + "five", + } +) + func testDocN(n int) doc.Metadata { return doc.Metadata{ ID: []byte(fmt.Sprintf("doc-%d", n)), @@ -228,22 +285,12 @@ func testDocN(n int) doc.Metadata { Value: []byte("bar"), }, { - Name: []byte("bucket-0"), - Value: moduloByteStr([]string{ - "one", - "two", - "three", - }, n), + Name: []byte(testDocBucket0Name), + Value: moduloByteStr(testDocBucket0Values, n), }, { - Name: []byte("bucket-1"), - Value: moduloByteStr([]string{ - "one", - "two", - "three", - "four", - "five", - }, n), + Name: []byte(testDocBucket1Name), + Value: moduloByteStr(testDocBucket1Values, n), }, }, } diff --git a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go index d2cd79318c..4d2dd1f858 100644 --- a/src/m3ninx/postings/roaring/bitmap_multi_readonly.go +++ b/src/m3ninx/postings/roaring/bitmap_multi_readonly.go @@ -480,7 +480,12 @@ func (i *multiBitmapIterator) Close() error { i.Reset(multiBitmapOptions{}) // Return this ref to the pool for re-use. - putMultiBitmapIterator(i) + // TODO:!!!!! + // TODO: Investigate why pooling this causes bad + // reuse and potentially put ref tracking on multi-bitmap + // iterators. + // TODO:!!!!! 
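+	// (Until that is understood, the iterator is simply left for the garbage
+	// collector rather than returned to the pool.)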
+ // putMultiBitmapIterator(i) return nil } From 43c0026c41ec24af607f702e440ac29511666be7 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Fri, 16 Apr 2021 10:16:44 -0400 Subject: [PATCH 105/106] Add test for populate worker --- .../index/mutable_segments_big_test.go | 301 ++++++++++++++++++ .../storage/index/mutable_segments_test.go | 289 ++--------------- 2 files changed, 325 insertions(+), 265 deletions(-) create mode 100644 src/dbnode/storage/index/mutable_segments_big_test.go diff --git a/src/dbnode/storage/index/mutable_segments_big_test.go b/src/dbnode/storage/index/mutable_segments_big_test.go new file mode 100644 index 0000000000..376245644a --- /dev/null +++ b/src/dbnode/storage/index/mutable_segments_big_test.go @@ -0,0 +1,301 @@ +// +build big + +// Copyright (c) 2021 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package index + +import ( + "fmt" + "testing" + "time" + + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/m3db/m3/src/dbnode/namespace" + "github.com/m3db/m3/src/m3ninx/doc" + "github.com/m3db/m3/src/m3ninx/index" + "github.com/m3db/m3/src/m3ninx/postings" + "github.com/m3db/m3/src/m3ninx/postings/roaring" + "github.com/m3db/m3/src/m3ninx/search" + "github.com/m3db/m3/src/m3ninx/search/query" + "github.com/m3db/m3/src/x/instrument" + "github.com/m3db/m3/src/x/pool" + xsync "github.com/m3db/m3/src/x/sync" + xtest "github.com/m3db/m3/src/x/test" + xtime "github.com/m3db/m3/src/x/time" +) + +type testMutableSegmentsResult struct { + logger *zap.Logger + cache *PostingsListCache + searchCache *PostingsListCache +} + +func newTestMutableSegments( + t *testing.T, + md namespace.Metadata, + blockStart time.Time, +) (*mutableSegments, testMutableSegmentsResult) { + cachedSearchesWorkers := xsync.NewWorkerPool(2) + cachedSearchesWorkers.Init() + + iOpts := instrument.NewTestOptions(t) + + poolOpts := pool.NewObjectPoolOptions().SetSize(0) + pool := postings.NewPool(poolOpts, roaring.NewPostingsList) + + cache, _, err := NewPostingsListCache(10, PostingsListCacheOptions{ + PostingsListPool: pool, + InstrumentOptions: iOpts, + }) + require.NoError(t, err) + + searchCache, _, err := NewPostingsListCache(10, PostingsListCacheOptions{ + PostingsListPool: pool, + InstrumentOptions: iOpts, + }) + require.NoError(t, err) + + opts := testOpts. + SetPostingsListCache(cache). + SetSearchPostingsListCache(searchCache). 
+		SetReadThroughSegmentOptions(ReadThroughSegmentOptions{
+			CacheRegexp:   true,
+			CacheTerms:    true,
+			CacheSearches: true,
+		})
+
+	segs, err := newMutableSegments(md, blockStart, opts, BlockOptions{},
+		cachedSearchesWorkers, namespace.NewRuntimeOptionsManager("foo"), iOpts)
+	require.NoError(t, err)
+
+	return segs, testMutableSegmentsResult{
+		logger:      iOpts.Logger(),
+		searchCache: searchCache,
+	}
+}
+
+func TestMutableSegmentsBackgroundCompactGCReconstructCachedSearches(t *testing.T) {
+	// Use read only postings.
+	prevReadOnlyPostings := index.MigrationReadOnlyPostings()
+	index.SetMigrationReadOnlyPostings(true)
+	defer index.SetMigrationReadOnlyPostings(prevReadOnlyPostings)
+
+	ctrl := xtest.NewController(t)
+	defer ctrl.Finish()
+
+	blockSize := time.Hour
+	testMD := newTestNSMetadata(t)
+	blockStart := time.Now().Truncate(blockSize)
+
+	nowNotBlockStartAligned := blockStart.Add(time.Minute)
+
+	segs, result := newTestMutableSegments(t, testMD, blockStart)
+	segs.backgroundCompactDisable = true // Disable to explicitly test.
+
+	inserted := 0
+	segs.Lock()
+	segsBackground := len(segs.backgroundSegments)
+	segs.Unlock()
+
+	for runs := 0; runs < 10; runs++ {
+		t.Run(fmt.Sprintf("run-%d", runs), func(t *testing.T) {
+			logger := result.logger.With(zap.Int("run", runs))
+
+			// Insert until we have a new background segment.
+			for {
+				segs.Lock()
+				curr := len(segs.backgroundSegments)
+				segs.Unlock()
+				if curr > segsBackground {
+					segsBackground = curr
+					break
+				}
+
+				batch := NewWriteBatch(WriteBatchOptions{
+					IndexBlockSize: blockSize,
+				})
+				for i := 0; i < 128; i++ {
+					stillIndexedBlockStartsAtGC := 1
+					if inserted%2 == 0 {
+						stillIndexedBlockStartsAtGC = 0
+					}
+					onIndexSeries := NewMockOnIndexSeries(ctrl)
+					onIndexSeries.EXPECT().
+						RelookupAndIncrementReaderWriterCount().
+						Return(onIndexSeries, true).
+						AnyTimes()
+					onIndexSeries.EXPECT().
+						RemoveIndexedForBlockStarts(gomock.Any()).
+						Return(RemoveIndexedForBlockStartsResult{
+							IndexedBlockStartsRemaining: stillIndexedBlockStartsAtGC,
+						}).
+						AnyTimes()
+					onIndexSeries.EXPECT().
+						DecrementReaderWriterCount().
+						AnyTimes()
+
+					batch.Append(WriteBatchEntry{
+						Timestamp:     nowNotBlockStartAligned,
+						OnIndexSeries: onIndexSeries,
+					}, testDocN(inserted))
+					inserted++
+				}
+
+				_, err := segs.WriteBatch(batch)
+				require.NoError(t, err)
+			}
+
+			// Perform some searches.
+			testDocSearches(t, segs)
+
+			// Make sure search postings cache was populated.
+			require.True(t, result.searchCache.lru.Len() > 0)
+			logger.Info("search cache populated", zap.Int("n", result.searchCache.lru.Len()))
+
+			// Start some async searches so we have searches going on while
+			// executing background compact GC.
+			doneCh := make(chan struct{}, 2)
+			defer close(doneCh)
+			for i := 0; i < 2; i++ {
+				go func() {
+					for {
+						select {
+						case <-doneCh:
+							return
+						default:
+						}
+						// Search continuously.
+						testDocSearches(t, segs)
+					}
+				}()
+			}
+
+			// Explicitly background compact and make sure that background segment
+			// is GC'd of series no longer present.
+			segs.Lock()
+			segs.sealedBlockStarts[xtime.ToUnixNano(blockStart)] = struct{}{}
+			segs.backgroundCompactGCPending = true
+			segs.backgroundCompactWithLock()
+			compactingBackgroundStandard := segs.compact.compactingBackgroundStandard
+			compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect
+			segs.Unlock()
+
+			// Should have kicked off a background compact GC.
+			require.True(t, compactingBackgroundStandard || compactingBackgroundGarbageCollect)
+
+			// Wait for background compact GC to run.
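+			// (Polls both compaction flags, since either a standard or a GC
+			// background compaction may still be in flight.)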
+ for { + segs.Lock() + compactingBackgroundStandard := segs.compact.compactingBackgroundStandard + compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect + segs.Unlock() + if !compactingBackgroundStandard && !compactingBackgroundGarbageCollect { + break + } + time.Sleep(100 * time.Millisecond) + } + + logger.Info("compaction done, search cache", zap.Int("n", result.searchCache.lru.Len())) + }) + } +} + +func testDocSearches( + t *testing.T, + segs *mutableSegments, +) { + for i := 0; i < len(testDocBucket0Values); i++ { + for j := 0; j < len(testDocBucket1Values); j++ { + readers, err := segs.AddReaders(nil) + assert.NoError(t, err) + + regexp0 := fmt.Sprintf("(%s|%s)", moduloByteStr(testDocBucket0Values, i), + moduloByteStr(testDocBucket0Values, i+1)) + b0, err := query.NewRegexpQuery([]byte(testDocBucket0Name), []byte(regexp0)) + assert.NoError(t, err) + + regexp1 := fmt.Sprintf("(%s|%s|%s)", moduloByteStr(testDocBucket1Values, j), + moduloByteStr(testDocBucket1Values, j+1), + moduloByteStr(testDocBucket1Values, j+2)) + b1, err := query.NewRegexpQuery([]byte(testDocBucket1Name), []byte(regexp1)) + assert.NoError(t, err) + + q := query.NewConjunctionQuery([]search.Query{b0, b1}) + searcher, err := q.Searcher() + assert.NoError(t, err) + + for _, reader := range readers { + readThrough, ok := reader.(search.ReadThroughSegmentSearcher) + assert.True(t, ok) + + pl, err := readThrough.Search(q, searcher) + assert.NoError(t, err) + + assert.True(t, pl.CountSlow() > 0) + } + } + } +} + +var ( + testDocBucket0Name = "bucket_0" + testDocBucket0Values = []string{ + "one", + "two", + "three", + } + testDocBucket1Name = "bucket_1" + testDocBucket1Values = []string{ + "one", + "two", + "three", + "four", + "five", + } +) + +func testDocN(n int) doc.Metadata { + return doc.Metadata{ + ID: []byte(fmt.Sprintf("doc-%d", n)), + Fields: []doc.Field{ + { + Name: []byte("foo"), + Value: []byte("bar"), + }, + { + Name: []byte(testDocBucket0Name), + Value: moduloByteStr(testDocBucket0Values, n), + }, + { + Name: []byte(testDocBucket1Name), + Value: moduloByteStr(testDocBucket1Values, n), + }, + }, + } +} + +func moduloByteStr(strs []string, n int) []byte { + return []byte(strs[n%len(strs)]) +} diff --git a/src/dbnode/storage/index/mutable_segments_test.go b/src/dbnode/storage/index/mutable_segments_test.go index 376245644a..9a9c54e7e6 100644 --- a/src/dbnode/storage/index/mutable_segments_test.go +++ b/src/dbnode/storage/index/mutable_segments_test.go @@ -1,5 +1,3 @@ -// +build big - // Copyright (c) 2021 Uber Technologies, Inc. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy @@ -23,279 +21,40 @@ package index import ( - "fmt" + "io" + "math/rand" "testing" - "time" - "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "go.uber.org/zap" - - "github.com/m3db/m3/src/dbnode/namespace" - "github.com/m3db/m3/src/m3ninx/doc" - "github.com/m3db/m3/src/m3ninx/index" - "github.com/m3db/m3/src/m3ninx/postings" - "github.com/m3db/m3/src/m3ninx/postings/roaring" - "github.com/m3db/m3/src/m3ninx/search" - "github.com/m3db/m3/src/m3ninx/search/query" - "github.com/m3db/m3/src/x/instrument" - "github.com/m3db/m3/src/x/pool" - xsync "github.com/m3db/m3/src/x/sync" - xtest "github.com/m3db/m3/src/x/test" - xtime "github.com/m3db/m3/src/x/time" ) -type testMutableSegmentsResult struct { - logger *zap.Logger - cache *PostingsListCache - searchCache *PostingsListCache -} - -func newTestMutableSegments( - t *testing.T, - md namespace.Metadata, - blockStart time.Time, -) (*mutableSegments, testMutableSegmentsResult) { - cachedSearchesWorkers := xsync.NewWorkerPool(2) - cachedSearchesWorkers.Init() - - iOpts := instrument.NewTestOptions(t) - - poolOpts := pool.NewObjectPoolOptions().SetSize(0) - pool := postings.NewPool(poolOpts, roaring.NewPostingsList) - - cache, _, err := NewPostingsListCache(10, PostingsListCacheOptions{ - PostingsListPool: pool, - InstrumentOptions: iOpts, - }) - require.NoError(t, err) - - searchCache, _, err := NewPostingsListCache(10, PostingsListCacheOptions{ - PostingsListPool: pool, - InstrumentOptions: iOpts, - }) - require.NoError(t, err) - - opts := testOpts. - SetPostingsListCache(cache). - SetSearchPostingsListCache(searchCache). - SetReadThroughSegmentOptions(ReadThroughSegmentOptions{ - CacheRegexp: true, - CacheTerms: true, - CacheSearches: true, - }) - - segs, err := newMutableSegments(md, blockStart, opts, BlockOptions{}, - cachedSearchesWorkers, namespace.NewRuntimeOptionsManager("foo"), iOpts) - require.NoError(t, err) - - return segs, testMutableSegmentsResult{ - logger: iOpts.Logger(), - searchCache: searchCache, - } -} - -func TestMutableSegmentsBackgroundCompactGCReconstructCachedSearches(t *testing.T) { - // Use read only postings. - prevReadOnlyPostings := index.MigrationReadOnlyPostings() - index.SetMigrationReadOnlyPostings(true) - defer index.SetMigrationReadOnlyPostings(prevReadOnlyPostings) - - ctrl := xtest.NewController(t) - defer ctrl.Finish() - - blockSize := time.Hour - testMD := newTestNSMetadata(t) - blockStart := time.Now().Truncate(blockSize) - - nowNotBlockStartAligned := blockStart.Add(time.Minute) - - segs, result := newTestMutableSegments(t, testMD, blockStart) - segs.backgroundCompactDisable = true // Disable to explicitly test. - - inserted := 0 - segs.Lock() - segsBackground := len(segs.backgroundSegments) - segs.Unlock() - - for runs := 0; runs < 10; runs++ { - t.Run(fmt.Sprintf("run-%d", runs), func(t *testing.T) { - logger := result.logger.With(zap.Int("run", runs)) - - // Insert until we have a new background segment. - for { - segs.Lock() - curr := len(segs.backgroundSegments) - segs.Unlock() - if curr > segsBackground { - segsBackground = curr - break - } - - batch := NewWriteBatch(WriteBatchOptions{ - IndexBlockSize: blockSize, - }) - for i := 0; i < 128; i++ { - stillIndexedBlockStartsAtGC := 1 - if inserted%2 == 0 { - stillIndexedBlockStartsAtGC = 0 - } - onIndexSeries := NewMockOnIndexSeries(ctrl) - onIndexSeries.EXPECT(). - RelookupAndIncrementReaderWriterCount(). 
- Return(onIndexSeries, true). - AnyTimes() - onIndexSeries.EXPECT(). - RemoveIndexedForBlockStarts(gomock.Any()). - Return(RemoveIndexedForBlockStartsResult{ - IndexedBlockStartsRemaining: stillIndexedBlockStartsAtGC, - }). - AnyTimes() - onIndexSeries.EXPECT(). - DecrementReaderWriterCount(). - AnyTimes() - - batch.Append(WriteBatchEntry{ - Timestamp: nowNotBlockStartAligned, - OnIndexSeries: onIndexSeries, - }, testDocN(inserted)) - inserted++ - } - - _, err := segs.WriteBatch(batch) - require.NoError(t, err) - } - - // Perform some searches. - testDocSearches(t, segs) - - // Make sure search postings cache was populated. - require.True(t, result.searchCache.lru.Len() > 0) - logger.Info("search cache populated", zap.Int("n", result.searchCache.lru.Len())) - - // Start some async searches so we have searches going on while - // executing background compact GC. - doneCh := make(chan struct{}, 2) - defer close(doneCh) - for i := 0; i < 2; i++ { - go func() { - for { - select { - case <-doneCh: - return - default: - } - // Search continously. - testDocSearches(t, segs) - } - }() - } - - // Explicitly background compact and make sure that background segment - // is GC'd of series no longer present. - segs.Lock() - segs.sealedBlockStarts[xtime.ToUnixNano(blockStart)] = struct{}{} - segs.backgroundCompactGCPending = true - segs.backgroundCompactWithLock() - compactingBackgroundStandard := segs.compact.compactingBackgroundStandard - compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect - segs.Unlock() - - // Should have kicked off a background compact GC. - require.True(t, compactingBackgroundStandard || compactingBackgroundGarbageCollect) - - // Wait for background compact GC to run. - for { - segs.Lock() - compactingBackgroundStandard := segs.compact.compactingBackgroundStandard - compactingBackgroundGarbageCollect := segs.compact.compactingBackgroundGarbageCollect - segs.Unlock() - if !compactingBackgroundStandard && !compactingBackgroundGarbageCollect { - break - } - time.Sleep(100 * time.Millisecond) - } - - logger.Info("compaction done, search cache", zap.Int("n", result.searchCache.lru.Len())) - }) - } -} - -func testDocSearches( - t *testing.T, - segs *mutableSegments, -) { - for i := 0; i < len(testDocBucket0Values); i++ { - for j := 0; j < len(testDocBucket1Values); j++ { - readers, err := segs.AddReaders(nil) - assert.NoError(t, err) - - regexp0 := fmt.Sprintf("(%s|%s)", moduloByteStr(testDocBucket0Values, i), - moduloByteStr(testDocBucket0Values, i+1)) - b0, err := query.NewRegexpQuery([]byte(testDocBucket0Name), []byte(regexp0)) - assert.NoError(t, err) - - regexp1 := fmt.Sprintf("(%s|%s|%s)", moduloByteStr(testDocBucket1Values, j), - moduloByteStr(testDocBucket1Values, j+1), - moduloByteStr(testDocBucket1Values, j+2)) - b1, err := query.NewRegexpQuery([]byte(testDocBucket1Name), []byte(regexp1)) - assert.NoError(t, err) - - q := query.NewConjunctionQuery([]search.Query{b0, b1}) - searcher, err := q.Searcher() - assert.NoError(t, err) - - for _, reader := range readers { - readThrough, ok := reader.(search.ReadThroughSegmentSearcher) - assert.True(t, ok) - - pl, err := readThrough.Search(q, searcher) - assert.NoError(t, err) - - assert.True(t, pl.CountSlow() > 0) - } +func TestPopulateCachedSearchesWorkerSafeCloserReuse(t *testing.T) { + var all []*mockCloser + defer func() { + for _, c := range all { + require.Equal(t, 1, c.closed) } + }() + + w := newPopulateCachedSearchesWorker() + for i := 0; i < 100; i++ { + n := rand.Intn(64) + for j := 0; j < 
n; j++ { + closer := &mockCloser{} + all = append(all, closer) + w.addCloser(closer) + } + w.close() } } -var ( - testDocBucket0Name = "bucket_0" - testDocBucket0Values = []string{ - "one", - "two", - "three", - } - testDocBucket1Name = "bucket_1" - testDocBucket1Values = []string{ - "one", - "two", - "three", - "four", - "five", - } -) +var _ io.Closer = (*mockCloser)(nil) -func testDocN(n int) doc.Metadata { - return doc.Metadata{ - ID: []byte(fmt.Sprintf("doc-%d", n)), - Fields: []doc.Field{ - { - Name: []byte("foo"), - Value: []byte("bar"), - }, - { - Name: []byte(testDocBucket0Name), - Value: moduloByteStr(testDocBucket0Values, n), - }, - { - Name: []byte(testDocBucket1Name), - Value: moduloByteStr(testDocBucket1Values, n), - }, - }, - } +type mockCloser struct { + closed int } -func moduloByteStr(strs []string, n int) []byte { - return []byte(strs[n%len(strs)]) +func (c *mockCloser) Close() error { + c.closed++ + return nil } From 6379d0b0689c31227e6534105c3229be4b83b019 Mon Sep 17 00:00:00 2001 From: Rob Skillington Date: Wed, 12 May 2021 18:25:29 -0400 Subject: [PATCH 106/106] If tag name is a graphite tag then do not allocate each time --- src/dbnode/storage/index/convert/convert.go | 23 ++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/dbnode/storage/index/convert/convert.go b/src/dbnode/storage/index/convert/convert.go index 93465ff219..ac84641db7 100644 --- a/src/dbnode/storage/index/convert/convert.go +++ b/src/dbnode/storage/index/convert/convert.go @@ -134,9 +134,9 @@ func FromSeriesIDAndTags(id ident.ID, tags ident.Tags) (doc.Metadata, error) { var clonedName, clonedValue []byte clonedName, expectedStart = findSliceOrClone(clonedID, nameBytes, expectedStart, - distanceBetweenTagNameAndValue) + distanceBetweenTagNameAndValue, true) clonedValue, expectedStart = findSliceOrClone(clonedID, valueBytes, expectedStart, - distanceBetweenTagValueAndNextName) + distanceBetweenTagValueAndNextName, false) fields = append(fields, doc.Field{ Name: clonedName, @@ -167,9 +167,9 @@ func FromSeriesIDAndTagIter(id ident.ID, tags ident.TagIterator) (doc.Metadata, var clonedName, clonedValue []byte clonedName, expectedStart = findSliceOrClone(clonedID, nameBytes, expectedStart, - distanceBetweenTagNameAndValue) + distanceBetweenTagNameAndValue, true) clonedValue, expectedStart = findSliceOrClone(clonedID, valueBytes, expectedStart, - distanceBetweenTagValueAndNextName) + distanceBetweenTagValueAndNextName, false) fields = append(fields, doc.Field{ Name: clonedName, @@ -248,9 +248,9 @@ func FromSeriesIDAndEncodedTags(id ident.BytesID, encodedTags ts.EncodedTags) (d var clonedName, clonedValue []byte clonedName, expectedStart = findSliceOrClone(clonedID, bytesName, expectedStart, - distanceBetweenTagNameAndValue) + distanceBetweenTagNameAndValue, true) clonedValue, expectedStart = findSliceOrClone(clonedID, bytesValue, expectedStart, - distanceBetweenTagValueAndNextName) + distanceBetweenTagValueAndNextName, false) fields = append(fields, doc.Field{ Name: clonedName, @@ -268,7 +268,16 @@ func FromSeriesIDAndEncodedTags(id ident.BytesID, encodedTags ts.EncodedTags) (d return d, nil } -func findSliceOrClone(id, tag []byte, expectedStart, nextPositionDistance int) ([]byte, int) { //nolint:unparam +func findSliceOrClone( + id, tag []byte, + expectedStart, nextPositionDistance int, + tagName bool, +) ([]byte, int) { //nolint:unparam + if tagName { + if idx, ok := graphite.TagIndex(tag); ok { + return graphite.TagName(idx), -1 + } + } n := len(tag) expectedEnd := 
expectedStart + n if expectedStart != -1 && expectedEnd <= len(id) &&