-
Notifications
You must be signed in to change notification settings - Fork 666
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
sync
-- client cleanup
#1680
Merged
Merged
sync
-- client cleanup
#1680
Changes from all commits
Commits
Show all changes
20 commits
Select commit
Hold shift + click to select a range
737d4d0
comment nits
accde38
comment nits
8beeb86
Error --> Warn log
2dd784d
nits
c68b57c
change order of return values in get
c6d57c4
remove sleep
17235ad
comment nits
49a9f16
comment nits
2b52ab3
clean up tracking of outstanding requests
a6ae3a5
comment
a852001
revert error changes
b17c2fe
simplify client get loop (#1681)
darioush 81f9dbc
nit
9b3b955
`sync` -- add exponential backoff (#1684)
8202366
nit
264f143
switch return value order for RequestAny
325e48f
Merge branch 'dev' into sync-refactor-client
0b0f248
Merge remote-tracking branch 'upstream/dev' into sync-refactor-client
96d3118
Merge branch 'sync-refactor-client' of github.com:ava-labs/avalancheg…
7adb42e
Merge branch 'dev' into sync-refactor-client
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ import ( | |
"context" | ||
"errors" | ||
"fmt" | ||
"math" | ||
"sync/atomic" | ||
"time" | ||
|
||
|
@@ -23,7 +24,9 @@ import ( | |
) | ||
|
||
const ( | ||
failedRequestSleepInterval = 10 * time.Millisecond | ||
initialRetryWait = 10 * time.Millisecond | ||
maxRetryWait = time.Second | ||
retryWaitFactor = 1.5 // Larger --> timeout grows more quickly | ||
|
||
epsilon = 1e-6 // small amount to add to time to avoid division by 0 | ||
) | ||
|
@@ -177,25 +180,24 @@ func (c *client) GetRangeProof(ctx context.Context, req *pb.SyncGetRangeProofReq | |
return getAndParse(ctx, c, reqBytes, parseFn) | ||
} | ||
|
||
// getAndParse uses [client] to send [request] to an arbitrary peer. If the peer responds, | ||
// [parseFn] is called with the raw response. If [parseFn] returns an error or the request | ||
// times out, this function will retry the request to a different peer until [ctx] expires. | ||
// If [parseFn] returns a nil error, the result is returned from getAndParse. | ||
func getAndParse[T any](ctx context.Context, client *client, request []byte, parseFn func(context.Context, []byte) (*T, error)) (*T, error) { | ||
// getAndParse uses [client] to send [request] to an arbitrary peer. | ||
// Returns the response to the request. | ||
// [parseFn] parses the raw response. | ||
// If the request is unsuccessful or the response can't be parsed, | ||
// retries the request to a different peer until [ctx] expires. | ||
func getAndParse[T any]( | ||
ctx context.Context, | ||
client *client, | ||
request []byte, | ||
parseFn func(context.Context, []byte) (*T, error), | ||
) (*T, error) { | ||
var ( | ||
lastErr error | ||
response *T | ||
) | ||
// Loop until the context is cancelled or we get a valid response. | ||
for attempt := 0; ; attempt++ { | ||
// If the context has finished, return the context error early. | ||
if err := ctx.Err(); err != nil { | ||
if lastErr != nil { | ||
return nil, fmt.Errorf("request failed after %d attempts with last error %w and ctx error %s", attempt, lastErr, err) | ||
} | ||
return nil, err | ||
} | ||
responseBytes, nodeID, err := client.get(ctx, request) | ||
for attempt := 1; ; attempt++ { | ||
nodeID, responseBytes, err := client.get(ctx, request) | ||
if err == nil { | ||
if response, err = parseFn(ctx, responseBytes); err == nil { | ||
return response, nil | ||
|
@@ -205,44 +207,66 @@ func getAndParse[T any](ctx context.Context, client *client, request []byte, par | |
client.log.Debug("request failed, retrying", | ||
zap.Stringer("nodeID", nodeID), | ||
zap.Int("attempt", attempt), | ||
zap.Error(err)) | ||
|
||
zap.Error(err), | ||
) | ||
// if [err] is being propagated from [ctx], avoid overwriting [lastErr]. | ||
if err != ctx.Err() { | ||
// if [err] is being propagated from [ctx], avoid overwriting [lastErr]. | ||
lastErr = err | ||
time.Sleep(failedRequestSleepInterval) | ||
} | ||
|
||
retryWait := initialRetryWait * time.Duration(math.Pow(retryWaitFactor, float64(attempt))) | ||
if retryWait > maxRetryWait || retryWait < 0 { // Handle overflows with negative check. | ||
retryWait = maxRetryWait | ||
} | ||
|
||
select { | ||
case <-ctx.Done(): | ||
if lastErr != nil { | ||
// prefer reporting [lastErr] if it's not nil. | ||
return nil, fmt.Errorf( | ||
"request failed after %d attempts with last error %w and ctx error %s", | ||
attempt, lastErr, ctx.Err(), | ||
) | ||
} | ||
return nil, ctx.Err() | ||
case <-time.After(retryWait): | ||
} | ||
} | ||
} | ||
|
||
// get sends [request] to an arbitrary peer and blocks until the node receives a response | ||
// or [ctx] expires. Returns the raw response from the peer, the peer's NodeID, and an | ||
// error if the request timed out. Thread safe. | ||
func (c *client) get(ctx context.Context, requestBytes []byte) ([]byte, ids.NodeID, error) { | ||
c.metrics.RequestMade() | ||
// get sends [request] to an arbitrary peer and blocks | ||
// until the node receives a response or a failure notification, | ||
// or until [ctx] is canceled. | ||
// Returns the peer's NodeID and response. | ||
// It's safe to call this method multiple times concurrently. | ||
func (c *client) get(ctx context.Context, request []byte) (ids.NodeID, []byte, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In other places like this we return the node ID first, so I did that here too. |
||
var ( | ||
response []byte | ||
nodeID ids.NodeID | ||
err error | ||
startTime = time.Now() | ||
) | ||
|
||
c.metrics.RequestMade() | ||
|
||
if len(c.stateSyncNodes) == 0 { | ||
response, nodeID, err = c.networkClient.RequestAny(ctx, c.stateSyncMinVersion, requestBytes) | ||
nodeID, response, err = c.networkClient.RequestAny(ctx, c.stateSyncMinVersion, request) | ||
} else { | ||
// get the next nodeID using the nodeIdx offset. If we're out of nodes, loop back to 0 | ||
// we do this every attempt to ensure we get a different node each time if possible. | ||
// Get the next nodeID to query using the [nodeIdx] offset. | ||
// If we're out of nodes, loop back to 0. | ||
// We do this to try to query a different node each time if possible. | ||
nodeIdx := atomic.AddUint32(&c.stateSyncNodeIdx, 1) | ||
nodeID = c.stateSyncNodes[nodeIdx%uint32(len(c.stateSyncNodes))] | ||
response, err = c.networkClient.Request(ctx, nodeID, requestBytes) | ||
response, err = c.networkClient.Request(ctx, nodeID, request) | ||
} | ||
if err != nil { | ||
c.metrics.RequestFailed() | ||
c.networkClient.TrackBandwidth(nodeID, 0) | ||
return response, nodeID, err | ||
return nodeID, response, err | ||
} | ||
|
||
bandwidth := float64(len(response)) / (time.Since(startTime).Seconds() + epsilon) | ||
c.networkClient.TrackBandwidth(nodeID, bandwidth) | ||
c.metrics.RequestSucceeded() | ||
return response, nodeID, nil | ||
return nodeID, response, nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed to avoid sleeping. It's not the end of the world to wait 10 ms to wake up in the event of context cancellation, but still seems like good practice to not sleep.