Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add timeout to bulker flush, add default case in failQueue #3986

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions internal/pkg/api/handleAck.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,14 +365,14 @@ func (ack *AckT) handleActionResult(ctx context.Context, zlog zerolog.Logger, ag

// Save action result document
if err := dl.CreateActionResult(ctx, ack.bulk, acr); err != nil {
zlog.Error().Err(err).Msg("create action result")
zlog.Error().Err(err).Str(logger.AgentID, agent.Agent.ID).Str(logger.ActionID, action.Id).Msg("create action result")
return err
}

if action.Type == TypeUpgrade {
event, _ := ev.AsUpgradeEvent()
if err := ack.handleUpgrade(ctx, zlog, agent, event); err != nil {
zlog.Error().Err(err).Msg("handle upgrade event")
zlog.Error().Err(err).Str(logger.AgentID, agent.Agent.ID).Str(logger.ActionID, action.Id).Msg("handle upgrade event")
return err
}
}
Expand Down Expand Up @@ -634,6 +634,8 @@ func (ack *AckT) handleUpgrade(ctx context.Context, zlog zerolog.Logger, agent *
zlog.Info().
Str("lastReportedVersion", agent.Agent.Version).
Str("upgradedAt", now).
Str(logger.AgentID, agent.Agent.ID).
Str(logger.ActionID, event.ActionId).
Msg("ack upgrade")

return nil
Expand Down
3 changes: 3 additions & 0 deletions internal/pkg/api/handleCheckin.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,8 @@ func (ct *CheckinT) validateRequest(zlog zerolog.Logger, w http.ResponseWriter,
}

func (ct *CheckinT) ProcessRequest(zlog zerolog.Logger, w http.ResponseWriter, r *http.Request, start time.Time, agent *model.Agent, ver string) error {
zlog = zlog.With().
Str(logger.AgentID, agent.Id).Logger()
validated, err := ct.validateRequest(zlog, w, r, start, agent)
if err != nil {
return err
Expand Down Expand Up @@ -338,6 +340,7 @@ func (ct *CheckinT) ProcessRequest(zlog zerolog.Logger, w http.ResponseWriter, r
actions, ackToken = convertActions(zlog, agent.Id, pendingActions)

span, ctx := apm.StartSpan(r.Context(), "longPoll", "process")

if len(actions) == 0 {
LOOP:
for {
Expand Down
30 changes: 22 additions & 8 deletions internal/pkg/bulk/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,14 @@ type Bulker struct {
}

const (
defaultFlushInterval = time.Second * 5
defaultFlushThresholdCnt = 32768
defaultFlushThresholdSz = 1024 * 1024 * 10
defaultMaxPending = 32
defaultBlockQueueSz = 32 // Small capacity to allow multiOp to spin fast
defaultAPIKeyMaxParallel = 32
defaultApikeyMaxReqSize = 100 * 1024 * 1024
defaultFlushInterval = time.Second * 5
defaultFlushThresholdCnt = 32768
defaultFlushThresholdSz = 1024 * 1024 * 10
defaultMaxPending = 32
defaultBlockQueueSz = 32 // Small capacity to allow multiOp to spin fast
defaultAPIKeyMaxParallel = 32
defaultApikeyMaxReqSize = 100 * 1024 * 1024
defaultFlushContextTimeout = time.Minute * 1
)

func NewBulker(es esapi.Transport, tracer *apm.Tracer, opts ...BulkOpt) *Bulker {
Expand Down Expand Up @@ -416,6 +417,7 @@ func (b *Bulker) Run(ctx context.Context) error {
Int("itemCnt", itemCnt).
Int("byteCnt", byteCnt).
Msg("Flush on timer")

err = doFlush()

case <-ctx.Done():
Expand Down Expand Up @@ -443,7 +445,11 @@ func (b *Bulker) flushQueue(ctx context.Context, w *semaphore.Weighted, queue qu
Str("queue", queue.Type()).
Msg("flushQueue Wait")

ctx, cancel := context.WithTimeout(ctx, defaultFlushContextTimeout)
defer cancel()

if err := w.Acquire(ctx, 1); err != nil {
zerolog.Ctx(ctx).Error().Err(err).Msg("flushQueue Wait error")
return err
}

Expand All @@ -458,6 +464,10 @@ func (b *Bulker) flushQueue(ctx context.Context, w *semaphore.Weighted, queue qu
go func() {
start := time.Now()

// deadline prevents bulker being blocked on flush
ctx, cancel := context.WithTimeout(ctx, defaultFlushContextTimeout)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The context we are wrapping here has a timeout associated with it from line 448.
Is that what we want in this case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right — it should probably be a separate timeout.

Copy link
Contributor Author

@juliaElastic juliaElastic Nov 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

made the change, I had a passing 10k and 50k run: https://buildkite.com/elastic/observability-perf/builds/3745

75k run stuck in upgrading: https://buildkite.com/elastic/observability-perf/builds/3746#019377e3-7719-494b-a952-86c06b748960
I'm only seeing deadline errors with the message "find action", which comes from handleAck, not the bulker:
https://platform-logging.kb.us-west2.gcp.elastic-cloud.com/app/r/s/4Xwhw

I don't know, though, how much this timeout helps, since it is hard to reproduce the timeout actually expiring.

In the last few weeks the scale tests on main (without the changes in this PR) were quite stable, and only failed for 30k+ runs on the upgrade step: https://buildkite.com/elastic/observability-perf/builds?branch=main

defer cancel()

if b.tracer != nil {
trans := b.tracer.StartTransaction(fmt.Sprintf("Flush queue %s", queue.Type()), "bulker")
trans.Context.SetLabel("queue.size", queue.cnt)
Expand Down Expand Up @@ -502,8 +512,12 @@ func (b *Bulker) flushQueue(ctx context.Context, w *semaphore.Weighted, queue qu
func failQueue(queue queueT, err error) {
for n := queue.head; n != nil; {
next := n.next // 'n' is invalid immediately on channel send
n.ch <- respT{
select {
case n.ch <- respT{
err: err,
}:
default:
panic("Unexpected blocked response channel on failQueue")
}
n = next
}
Expand Down
Loading