Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reload search blocks and replay search WAL #1000

Merged
merged 23 commits into from
Oct 6, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cdec27a
Checkpoint: Initial implementation of v2 search WAL
annanay25 Sep 29, 2021
6738a72
better handling of reload of blocks without search data
annanay25 Sep 29, 2021
52e5f61
Checkpoint
annanay25 Sep 30, 2021
4e05804
another commit another wal replay
annanay25 Sep 30, 2021
dfbbd02
wip: ingester search test
annanay25 Sep 30, 2021
5fa8cc9
Fix Rescan search blocks, move ParseFilename into wal folder
annanay25 Oct 1, 2021
1823cb1
Append block uses new ParseFilename
annanay25 Oct 1, 2021
18d2e64
Add tests, benchmarks, pass encoding along correctly
annanay25 Oct 1, 2021
481d51f
Changelog
annanay25 Oct 1, 2021
82cab7f
Merge branch 'main' into reload-search-blocks
annanay25 Oct 1, 2021
99506a3
Post merge cleanup
annanay25 Oct 1, 2021
98c7e1b
Err handling for search disabled
annanay25 Oct 1, 2021
29d8eaf
Use the right level package, reload backend search blocks
annanay25 Oct 1, 2021
195b2fa
never refactor variables using an ide
annanay25 Oct 1, 2021
0747e09
Address comments, fix test
annanay25 Oct 1, 2021
b5e5941
Reuse StreamingSearchBlock iterator in search, relocate dedupe test
mdisibio Oct 4, 2021
5a163fe
Make wal search encoding configurable, default to gzip like backend b…
mdisibio Oct 5, 2021
de3a1e2
Make wal search encoding configurable, default to gzip like backend b…
mdisibio Oct 5, 2021
1aaca42
Simplify some search tests which were doing more work than seemed nec…
mdisibio Oct 5, 2021
5b7485f
Comment out flaky test as discussed
mdisibio Oct 5, 2021
97dafa2
Code review suggestions
mdisibio Oct 6, 2021
7b6a81f
Code review suggestions, add tests for ParseFileName
mdisibio Oct 6, 2021
7d6a3b2
Code review suggestions
mdisibio Oct 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 33 additions & 4 deletions modules/ingester/ingester.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (
"sync"
"time"

"github.com/grafana/tempo/tempodb/search"

"github.com/cortexproject/cortex/pkg/ring"
"github.com/cortexproject/cortex/pkg/util/log"
"github.com/go-kit/kit/log/level"
Expand Down Expand Up @@ -106,9 +108,6 @@ func (i *Ingester) starting(ctx context.Context) error {
return fmt.Errorf("failed to rediscover local blocks %w", err)
}

// Search data is considered experimental and removed on every startup.
i.clearSearchData()

// Now that user states have been created, we can start the lifecycler.
// Important: we want to keep lifecycler running until we ask it to stop, so we need to give it independent context
if err := i.lifecycler.StartAsync(context.Background()); err != nil {
Expand Down Expand Up @@ -335,6 +334,30 @@ func (i *Ingester) replayWal() error {
return fmt.Errorf("fatal error replaying wal %w", err)
}

searchBlocks, err := search.RescanBlocks(i.store.WAL().GetFilepath())
if err != nil {
return fmt.Errorf("fatal error replaying search wal %w", err)
}

// clear any searchBlock that does not have a corresponding entry in tracesBlocks
annanay25 marked this conversation as resolved.
Show resolved Hide resolved
for j := len(searchBlocks) - 1; j >= 0; j-- {
clear := true
for _, tracesBlock := range blocks {
if searchBlocks[j].BlockID == tracesBlock.BlockID() {
clear = false
break
}
}

if clear {
err := searchBlocks[j].Clear()
if err != nil { // just log the error
level.Warn(log.Logger).Log("msg", "error clearing search WAL file", "blockID", searchBlocks[j].BlockID, "err", err)
}
searchBlocks = append(searchBlocks[:j], searchBlocks[j+1:]...)
}
}

for _, b := range blocks {
tenantID := b.Meta().TenantID

Expand All @@ -354,7 +377,13 @@ func (i *Ingester) replayWal() error {
return err
}

instance.AddCompletingBlock(b)
var searchWALBlock *search.StreamingSearchBlock
for _, s := range searchBlocks {
if b.BlockID() == s.BlockID {
searchWALBlock = s
mdisibio marked this conversation as resolved.
Show resolved Hide resolved
}
}
instance.AddCompletingBlock(b, searchWALBlock)

i.enqueue(&flushOp{
kind: opKindComplete,
Expand Down
10 changes: 0 additions & 10 deletions modules/ingester/ingester_search.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ package ingester
import (
"context"

"github.com/cortexproject/cortex/pkg/util/log"
"github.com/go-kit/kit/log/level"
"github.com/grafana/tempo/pkg/tempopb"
"github.com/weaveworks/common/user"
)
Expand Down Expand Up @@ -66,11 +64,3 @@ func (i *Ingester) SearchTagValues(ctx context.Context, req *tempopb.SearchTagVa

return resp, nil
}

func (i *Ingester) clearSearchData() {
// clear wal
err := i.store.WAL().ClearFolder(searchDir)
if err != nil {
level.Error(log.Logger).Log("msg", "error clearing search data from wal")
}
}
36 changes: 36 additions & 0 deletions modules/ingester/ingester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,42 @@ func TestWal(t *testing.T) {
}
}

//func TestSearchWAL(t *testing.T) {
annanay25 marked this conversation as resolved.
Show resolved Hide resolved
// tmpDir := t.TempDir()
//
// i, _, _ := defaultIngester(t, tmpDir)
// inst, ok := i.getInstanceByID("test")
// require.True(t, ok)
//
// // Write wal
// err := inst.CutCompleteTraces(0, true)
// require.NoError(t, err)
// _, err = inst.CutBlockIfReady(0, 0, true)
// require.NoError(t, err)
//
// // assert that search WAL is being searched
// ctx := user.InjectOrgID(context.Background(), "test")
// searchReq := &tempopb.SearchRequest{Tags: map[string]string{
// search.SecretExhaustiveSearchTag: "",
// }}
// results, err := inst.Search(ctx, searchReq)
// assert.NoError(t, err)
// assert.Greater(t, results.Metrics.InspectedTraces, 0)
//
// // Shutdown
// err = i.stopping(nil)
// require.NoError(t, err)
//
// // replay wal
// i, _, _ = defaultIngester(t, tmpDir)
// inst, ok = i.getInstanceByID("test")
// require.True(t, ok)
//
// results, err = inst.Search(ctx, searchReq)
// assert.NoError(t, err)
// assert.Greater(t, results.Metrics.InspectedTraces, 0)
//}

// TestWalReplayDeletesLocalBlocks simulates the condition where an ingester restarts after a wal is completed
// to the local disk, but before the wal is deleted. On startup both blocks exist, and the ingester now errs
// on the side of caution and chooses to replay the wal instead of rediscovering the local block.
Expand Down
10 changes: 8 additions & 2 deletions modules/ingester/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,11 +447,17 @@ func (i *instance) FindTraceByID(ctx context.Context, id []byte) (*tempopb.Trace
// AddCompletingBlock adds an AppendBlock directly to the slice of completing blocks.
// This is used during wal replay. It is expected that calling code will add the appropriate
// jobs to the queue to eventually flush these.
func (i *instance) AddCompletingBlock(b *wal.AppendBlock) {
func (i *instance) AddCompletingBlock(b *wal.AppendBlock, s *search.StreamingSearchBlock) {
i.blocksMtx.Lock()
defer i.blocksMtx.Unlock()

i.completingBlocks = append(i.completingBlocks, b)

// search WAL
if s == nil {
return
}
i.searchAppendBlocks[b] = &searchStreamingBlockEntry{b: s}
}

// getOrCreateTrace will return a new trace object for the given request
Expand Down Expand Up @@ -623,7 +629,7 @@ func (i *instance) rediscoverLocalBlocks(ctx context.Context) error {

i.blocksMtx.Lock()
i.completeBlocks = append(i.completeBlocks, ib)
//i.searchCompleteBlocks[ib] = sb
//i.searchCompleteBlocks[ib] = &searchLocalBlockEntry{b: sb}
i.blocksMtx.Unlock()

level.Info(log.Logger).Log("msg", "reloaded local block", "tenantID", i.instanceID, "block", id.String(), "flushed", ib.FlushedTime())
Expand Down
120 changes: 120 additions & 0 deletions tempodb/search/rescan_blocks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package search

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"time"

	cortex_util "github.com/cortexproject/cortex/pkg/util/log"
	"github.com/go-kit/log/level"
	"github.com/google/uuid"

	"github.com/grafana/tempo/pkg/tempofb"
	"github.com/grafana/tempo/tempodb/backend"
	"github.com/grafana/tempo/tempodb/encoding"
	"github.com/grafana/tempo/tempodb/wal"
)

// RescanBlocks scans through the search directory in the WAL folder and replays files
// todo: copied from wal.RescanBlocks(), see if we can reduce duplication?
func RescanBlocks(walPath string) ([]*StreamingSearchBlock, error) {
searchFilepath := filepath.Join(walPath, "search")
files, err := ioutil.ReadDir(searchFilepath)
if err != nil {
return nil, err
}

blocks := make([]*StreamingSearchBlock, 0, len(files))
for _, f := range files {
if f.IsDir() {
continue
}
start := time.Now()
level.Info(cortex_util.Logger).Log("msg", "beginning replay", "file", f.Name(), "size", f.Size())

b, warning, err := newStreamingSearchBlockFromWALReplay(f.Name())

remove := false
if err != nil {
// wal replay failed, clear and warn
level.Warn(cortex_util.Logger).Log("msg", "failed to replay block. removing.", "file", f.Name(), "err", err)
remove = true
}

if b != nil && b.appender.Length() == 0 {
level.Warn(cortex_util.Logger).Log("msg", "empty wal file. ignoring.", "file", f.Name(), "err", err)
remove = true
}

if warning != nil {
level.Warn(cortex_util.Logger).Log("msg", "received warning while replaying block. partial replay likely.", "file", f.Name(), "warning", warning, "records", b.appender.Length())
}

if remove {
err = os.Remove(filepath.Join(searchFilepath, f.Name()))
if err != nil {
return nil, err
}
continue
}

level.Info(cortex_util.Logger).Log("msg", "replay complete", "file", f.Name(), "duration", time.Since(start))

blocks = append(blocks, b)
}

return blocks, nil
}

// newStreamingSearchBlockFromWALReplay creates a StreamingSearchBlock with in-memory records from a search WAL file
func newStreamingSearchBlockFromWALReplay(filename string) (*StreamingSearchBlock, error, error) {
f, err := os.OpenFile(filename, os.O_RDONLY, 0644)
if err != nil {
return nil, nil, err
}

blockID, _, _, _, _, err := parseFilename(filename)
annanay25 marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return nil, nil, err
}

// version is pinned to v2 for now
v, err := encoding.FromVersion("v2")
if err != nil {
return nil, nil, err
}

blockHeader := tempofb.NewSearchBlockHeaderMutable()
records, warning, err := wal.ReplayWALAndGetRecords(f, v, backend.EncNone, func(bytes []byte) error {
entry := tempofb.SearchEntryFromBytes(bytes)
blockHeader.AddEntry(entry)
return nil
})
if err != nil {
return nil, nil, err
}
return &StreamingSearchBlock{
BlockID: blockID,
file: f,
appender: encoding.NewRecordAppender(records),
header: blockHeader,
encoding: v,
}, warning, nil
}

// parseFilename extracts block metadata from a search WAL file name of the
// expected colon-separated form
// <blockID>:<tenantID>:<version>:<encoding>:<dataEncoding>[:...].
// It returns an error when the name does not have enough segments or the
// block ID is not a valid UUID.
// Note: the fourth return is named enc (not encoding) to avoid shadowing the
// imported tempodb/encoding package.
func parseFilename(filename string) (blockID uuid.UUID, tenantID string, version string, enc string, dataEncoding string, err error) {
	splits := strings.Split(filename, ":")

	// NOTE(review): requires at least 6 segments although only 5 are consumed
	// below — confirm this matches the writer's naming scheme.
	if len(splits) < 6 {
		// Previously this returned a nil err here, silently reporting success
		// with all-zero values (blockID == uuid.Nil). Return an explicit error.
		return uuid.Nil, "", "", "", "", fmt.Errorf("unable to parse search wal filename: %s", filename)
	}

	id, err := uuid.Parse(splits[0])
	if err != nil {
		return uuid.Nil, "", "", "", "", err
	}
	// todo: any other validation?
	return id, splits[1], splits[2], splits[3], splits[4], nil
}
Loading