Skip to content

Commit

Permalink
Bug 1644364 - Move logic to limit max retries on recoverable failures…
Browse files Browse the repository at this point in the history
… to glean-core (#1120)
  • Loading branch information
Beatriz Rizental authored Aug 4, 2020
1 parent 8838047 commit 78bee73
Show file tree
Hide file tree
Showing 10 changed files with 138 additions and 72 deletions.
1 change: 1 addition & 0 deletions .dictionary
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ docstrings
dokka
dropdown
enqueue
enqueued
enum
envs
exe
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@
[Full changelog](https://github.com/mozilla/glean/compare/v32.0.0...main)

* General
* Move logic to limit the number of retries on ping uploading "recoverable failures" to glean-core. ([#1120](https://github.com/mozilla/glean/pull/1120))
* The functionality to limit the number of retries in these cases was introduced to the Glean SDK in `v31.1.0`. The work done now was to move that logic to the glean-core in order to avoid code duplication throughout the language bindings.
* Update `glean_parser` to `v1.28.3`
* BUGFIX: Generate valid C# code when using Labeled metric types.
* BUGFIX: Support `HashSet` and `Dictionary` in the C# generated code.
* C#
* Add support for the String List metric type.
* Enable generating the C# APIs using the glean_parser.
* Python
* BUGFIX: Limit the number of retries for 5xx server errors on ping uploads. ([#1120](https://github.com/mozilla/glean/pull/1120))
* This kinds of failures yield a "recoverable error", which means the ping gets re-enqueued. That can cause infinite loops on the ping upload worker. For python we were incorrectly only limiting the number of retries for I/O errors, another type of "recoverable error".

# v32.0.0 (2020-08-03)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,15 @@ internal sealed class PingUploadTask {
object Wait : PingUploadTask()

/**
* A flag signaling that the pending pings queue is empty and requester is done.
* A flag signaling that requester doesn't need to request any more upload tasks at this moment.
*
* There are two possibilities for this scenario:
* * Pending pings queue is empty, no more pings to request;
* * Requester has reported more max recoverable upload failures on the same uploading_window[1]
* and should stop requesting at this moment.
*
* [1]: An "uploading window" starts when a requester gets a new `PingUploadTask::Upload(PingRequest)`
* response and finishes when they finally get a `PingUploadTask::Done` or `PingUploadTask::Wait` response.
*/
object Done : PingUploadTask()
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import mozilla.telemetry.glean.Glean
import mozilla.telemetry.glean.net.FfiPingUploadTask
import mozilla.telemetry.glean.utils.testFlushWorkManagerJob
import mozilla.telemetry.glean.net.PingUploadTask
import mozilla.telemetry.glean.rust.Constants

/**
* Build the constraints around which the worker can be run, such as whether network
Expand Down Expand Up @@ -53,8 +52,6 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
companion object {
internal const val PING_WORKER_TAG = "mozac_service_glean_ping_upload_worker"

internal const val MAX_RETRIES = 3

/**
* Function to aid in properly enqueuing the worker in [WorkManager]
*
Expand Down Expand Up @@ -98,11 +95,6 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
*/
@Suppress("ReturnCount")
override fun doWork(): Result {
// We counte the number of failures,
// if we get more than three failures in upload we will stop this worker.
//
// This is a hack before a more robust mechanism is implemented in Rust.
var uploadFailures = 0
do {
// Create a slot of memory for the task: glean-core will write data into
// the allocated memory.
Expand All @@ -121,17 +113,14 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
Glean.configuration
).toFfi()

if (result == Constants.UPLOAD_RESULT_RECOVERABLE) {
uploadFailures++
}

// Process the upload response
LibGleanFFI.INSTANCE.glean_process_ping_upload_response(incomingTask, result)
}
PingUploadTask.Wait -> return Result.retry()
PingUploadTask.Done -> return Result.success()
}
} while (uploadFailures < MAX_RETRIES)
return Result.failure()
} while (true)
// Limits are enforced by glean-core to avoid an inifinite loop here.
// Whenever a limit is reached, this binding will receive `PingUploadTask.Done` and step out.
}
}
13 changes: 3 additions & 10 deletions glean-core/csharp/Glean/Net/BaseUploader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ internal class BaseUploader
// How many times to attempt waiting when told to by glean-core's upload API.
private const int MAX_WAIT_ATTEMPTS = 3;

// Maximum number of recoverable errors allowed before aborting the ping uploader.
private const int MAX_RETRIES = 3;

private readonly IPingUploader uploader;

/// <summary>
Expand Down Expand Up @@ -100,9 +97,10 @@ internal void TriggerUpload(Configuration config)
// FOR TESTING Implement the upload worker here and call this from Glean.cs

int waitAttempts = 0;
int uploadFailures = 0;

while (uploadFailures < MAX_RETRIES)
// Limits are enforced by glean-core to avoid an inifinite loop here.
// Whenever a limit is reached, this binding will receive `UploadTaskTag.Done` and step out.
while (true)
{
FfiUploadTask incomingTask = new FfiUploadTask();
LibGleanFFI.glean_get_upload_task(ref incomingTask);
Expand All @@ -123,11 +121,6 @@ internal void TriggerUpload(Configuration config)
// Delegate the actual upload and get its return value.
UploadResult result = Upload(path, body, headers, config);

if (result is RecoverableFailure)
{
uploadFailures += 1;
}

// Copy the `FfiUploadTask` to unmanaged memory, because
// `glean_process_ping_upload` assumes it has to free the memory.
IntPtr ptrCopy = Marshal.AllocHGlobal(Marshal.SizeOf(incomingTask));
Expand Down
11 changes: 3 additions & 8 deletions glean-core/ios/Glean/Net/HttpPingUploader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ public class HttpPingUploader {
static let recoverableErrorStatusCode: UInt16 = 500
// For this error, the ping data will be deleted and no retry happens
static let unrecoverableErrorStatusCode: UInt16 = 400

// Maximum number of recoverable errors allowed before aborting the ping uploader
static let maxRetries = 3
}

private let logger = Logger(tag: Constants.logTag)
Expand Down Expand Up @@ -98,18 +95,16 @@ public class HttpPingUploader {
/// It will report back the task status to Glean, which will take care of deleting pending ping files.
/// It will continue upload as long as it can fetch new tasks.
func process() {
var uploadFailures = 0
while uploadFailures < Constants.maxRetries {
// Limits are enforced by glean-core to avoid an inifinite loop here.
// Whenever a limit is reached, this binding will receive `.done` and step out.
while true {
var incomingTask = FfiPingUploadTask()
glean_get_upload_task(&incomingTask)
let task = incomingTask.toPingUploadTask()

switch task {
case let .upload(request):
self.upload(path: request.path, data: request.body, headers: request.headers) { result in
if case .recoverableFailure = result {
uploadFailures += 1
}
glean_process_ping_upload_response(&incomingTask, result.toFfi())
}
case .wait:
Expand Down
17 changes: 4 additions & 13 deletions glean-core/python/glean/net/ping_upload_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from .._glean_ffi import ffi as ffi_support # type: ignore
from .._dispatcher import Dispatcher
from .._process_dispatcher import ProcessDispatcher
from .ping_uploader import RecoverableFailure


log = logging.getLogger(__name__)
Expand All @@ -26,9 +25,6 @@
# How many times to attempt waiting when told to by glean-core's upload API.
MAX_WAIT_ATTEMPTS = 3

# Maximum number of recoverable errors allowed before aborting the ping uploader
MAX_RETRIES = 3


class PingUploadWorker:
@classmethod
Expand Down Expand Up @@ -120,9 +116,9 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:

wait_attempts = 0

upload_failures = 0

while upload_failures < MAX_RETRIES:
# Limits are enforced by glean-core to avoid an inifinite loop here.
# Whenever a limit is reached, this binding will receive `UploadTaskTag.DONE` and step out.
while True:
incoming_task = ffi_support.new("FfiPingUploadTask *")
_ffi.lib.glean_get_upload_task(incoming_task)

Expand All @@ -146,9 +142,6 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
url_path, body, _parse_ping_headers(headers, doc_id), configuration
)

if isinstance(upload_result, RecoverableFailure):
upload_failures = upload_failures + 1

# Process the response.
_ffi.lib.glean_process_ping_upload_response(
incoming_task, upload_result.to_ffi()
Expand All @@ -161,9 +154,7 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
else:
return False
elif tag == UploadTaskTag.DONE:
break

return True
return True


__all__ = ["PingUploadWorker"]
12 changes: 7 additions & 5 deletions glean-core/python/tests/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,12 @@ def test_500_error_submit(safe_httpserver, monkeypatch):
ProcessDispatcher._wait_for_last_process()

# This kind of recoverable error will be tried 10 times
assert 10 == len(safe_httpserver.requests)
# The number of retries is defined on glean-core
assert 3 == len(safe_httpserver.requests)

metric = get_upload_failure_metric()
assert not metric["status_code_4xx"].test_has_value()
assert 10 == metric["status_code_5xx"].test_get_value()
assert 3 == metric["status_code_5xx"].test_get_value()


def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
Expand Down Expand Up @@ -113,12 +114,13 @@ def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
counter.add()
times += 1

# This kind of recoverable error will be tried 10 times
assert 10 == len(slow_httpserver.requests)
# This kind of recoverable error will be tried 3 times
# The number of retries is defined on glean-core
assert 3 == len(slow_httpserver.requests)

metric = get_upload_failure_metric()
assert not metric["status_code_4xx"].test_has_value()
assert 10 == metric["status_code_5xx"].test_get_value()
assert 3 == metric["status_code_5xx"].test_get_value()

assert times > 0
assert times == counter.test_get_value()
Expand Down
2 changes: 1 addition & 1 deletion glean-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ impl Glean {
///
/// * `Wait` - which means the requester should ask again later;
/// * `Upload(PingRequest)` - which means there is a ping to upload. This wraps the actual request object;
/// * `Done` - which means there are no more pings queued right now.
/// * `Done` - which means requester should stop asking for now.
///
/// # Return value
///
Expand Down
Loading

1 comment on commit 78bee73

@firefoxci-taskcluster
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh oh! Looks like an error! Details

Entity already exists

Please sign in to comment.