Skip to content

Commit

Permalink
RUM-3461 Track App Hang "start", "end" and "cancellation"
Browse files Browse the repository at this point in the history
  • Loading branch information
ncreated committed Mar 28, 2024
1 parent a3a1d7f commit 3af24b5
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import Foundation
///
/// - Unlike `DDCrashReport`, the backtrace report can be generated on-demand without the actual crash being triggered.
/// - Like in `DDCrashReport`, threads and stacks information in `BacktraceReport` follows the format compatible with Datadog symbolication.
public struct BacktraceReport {
public struct BacktraceReport: Codable {
/// The stack trace of the thread for which the backtrace is generated.
public let stack: String
/// Represents all threads currently running in the process.
Expand Down
10 changes: 4 additions & 6 deletions DatadogRUM/Sources/Instrumentation/AppHangs/AppHang.swift
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ import Foundation
import DatadogInternal

/// An intermediate representation of an App Hang.
internal struct AppHang {
internal struct AppHang: Codable {
/// The result of generating backtrace for this hang.
enum BacktraceGenerationResult {
enum BacktraceGenerationResult: Codable {
/// Indicates that backtrace generation succeeded.
/// The associated `BacktraceReport` includes the snapshot of all running threads during the hang.
case succeeded(BacktraceReport)
Expand All @@ -21,10 +21,8 @@ internal struct AppHang {
case notAvailable
}

/// The date of hang end.
let date: Date
/// The duration of the hang.
let duration: TimeInterval
/// The date of hang start.
let startDate: Date
/// The result of generating backtrace for the hang.
let backtraceResult: BacktraceGenerationResult
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ internal class AppHangsObserver: RUMCommandPublisher {
backtraceReporter: backtraceReporter,
telemetry: telemetry
)
watchdogThread.onHangEnded = { [weak self] appHang in
watchdogThread.onHangEnded = { [weak self] appHang, duration in
// called on watchdog thread
self?.report(appHang: appHang)
self?.report(nonFatal: appHang, duration: duration)
}
}

Expand All @@ -59,17 +59,17 @@ internal class AppHangsObserver: RUMCommandPublisher {
self.subscriber = subscriber
}

private func report(appHang: AppHang) {
private func report(nonFatal appHang: AppHang, duration: TimeInterval) {
let command = RUMAddCurrentViewAppHangCommand(
time: appHang.date,
time: appHang.startDate,
attributes: [:],
message: Constants.appHangErrorMessage,
type: Constants.appHangErrorType,
stack: appHang.backtraceResult.stack,
threads: appHang.backtraceResult.threads,
binaryImages: appHang.backtraceResult.binaryImages,
isStackTraceTruncated: appHang.backtraceResult.wasTruncated,
hangDuration: appHang.duration
hangDuration: duration
)

subscriber?.process(command: command)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ internal final class AppHangsWatchdogThread: Thread {
/// Because the sleep might start right at the moment of hang beginning on the main thread, the watchdog thread will miss the first 50 milliseconds of hang duration.
/// If the actual hang lasts exactly 2 seconds, the watchdog will measure it as 1950 milliseconds. As a result, the SDK will not report it as it falls under `appHangThreshold`.
static let tolerance: Double = 0.025 // 2.5%

/// Hang's duration threshold to consider it a false-positive.
///
/// It is used to skip hangs with irrealistically long duration (way more than assumed iOS watchdog termination limit: ~10s).
/// This is to anticipate false-positives caused by watchdog thread suspension when app leaves the foreground.
static let falsePositiveThreshold: TimeInterval = 30
}

/// The minimal duration of the main thread hang to consider it an App Hang.
Expand All @@ -35,16 +41,27 @@ internal final class AppHangsWatchdogThread: Thread {
private let dateProvider: DateProvider
/// Backtrace reporter for hang's stack trace generation.
private let backtraceReporter: BacktraceReporting
/// The hang's duration threshold to consider it a false-positive.
private let falsePositiveThreshold: TimeInterval
/// An identifier of the main thread required for backtrace generation.
/// Because backtrace is generated from the watchdog thread, we must identify the main thread to be promoted in `BacktraceReport`. This value is
/// obtained at runtime, only once and it is cached for later use.
@ReadWriteLock
private var mainThreadID: ThreadID? = nil
/// Telemetry interface.
private let telemetry: Telemetry
/// Closure to be notified when App Hang ends. It will be executed on the watchdog thread.
/// Closure to be notified when App Hang starts.
/// It is executed on the watchdog thread.
@ReadWriteLock
internal var onHangStarted: ((AppHang) -> Void)?
/// Closure to be notified when App Hang gets cancelled due to possible false-positive.
/// It is executed on the watchdog thread.
@ReadWriteLock
internal var onHangEnded: ((AppHang) -> Void)?
internal var onHangCancelled: ((AppHang) -> Void)?
/// Closure to be notified when App Hang ends. It passes the hang and its duration.
/// It is executed on the watchdog thread.
@ReadWriteLock
internal var onHangEnded: ((AppHang, TimeInterval) -> Void)?
/// A block called after this thread finished its pass and will become idle.
@ReadWriteLock
internal var onBeforeSleep: (() -> Void)?
Expand All @@ -64,13 +81,15 @@ internal final class AppHangsWatchdogThread: Thread {
queue: DispatchQueue,
dateProvider: DateProvider,
backtraceReporter: BacktraceReporting,
telemetry: Telemetry
telemetry: Telemetry,
falsePositiveThreshold: TimeInterval = Constants.falsePositiveThreshold
) {
self.appHangThreshold = appHangThreshold
self.idleInterval = appHangThreshold * Constants.tolerance
self.mainQueue = queue
self.dateProvider = dateProvider
self.backtraceReporter = backtraceReporter
self.falsePositiveThreshold = falsePositiveThreshold
self.telemetry = telemetry

super.init()
Expand Down Expand Up @@ -100,6 +119,7 @@ internal final class AppHangsWatchdogThread: Thread {
}

let waitStart: DispatchTime = .now()
let hangStart = dateProvider.now

// Schedule task on the main thread to measure how fast it responds
mainQueue.async {
Expand Down Expand Up @@ -140,25 +160,27 @@ internal final class AppHangsWatchdogThread: Thread {
telemetry.error("Failed to generate App Hang backtrace", error: error)
}

let hang = AppHang(
startDate: hangStart,
backtraceResult: backtraceResult
)
onHangStarted?(hang)

// Previous wait timed out, so wait again for the task completion, this time infinitely until the hang ends.
mainThreadTask.wait()

// The hang has finished.
let hangDuration = interval(from: waitStart, to: .now())

if hangDuration > 30 { // sanity-check
if hangDuration > falsePositiveThreshold { // sanity-check
// If the hang duration is irrealistically long (way more than assumed iOS watchdog termination limit: ~10s), send telemetry.
// This could be another false-positive caused by thread suspension between the two `wait()` calls.
telemetry.debug("Detected an App Hang with an unusually long duration", attributes: ["hang_duration": hangDuration])
telemetry.debug("Detected an App Hang with an unusually long duration >\(falsePositiveThreshold)s", attributes: ["hang_duration": hangDuration])
onHangCancelled?(hang) // cancel hang as possible false-positive
continue
}

let appHang = AppHang(
date: dateProvider.now,
duration: hangDuration,
backtraceResult: backtraceResult
)
onHangEnded?(appHang)
onHangEnded?(hang, hangDuration)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ import DatadogInternal
@testable import DatadogRUM

class AppHangsWatchdogThreadTests: XCTestCase {
func testWhenQueueHangsAboveThreshold_itReportsAppHangs() {
let trackHangs = expectation(description: "track 3 App Hangs")
trackHangs.expectedFulfillmentCount = 3
func testWhenQueueHangsAboveThreshold_itReportsAppHangStartAndEnd() {
let trackHangStarts = expectation(description: "track start of 3 App Hangs")
trackHangStarts.expectedFulfillmentCount = 3
let trackHangEnds = expectation(description: "track end of 3 App Hangs")
trackHangEnds.expectedFulfillmentCount = 3

// Given
let appHangThreshold: TimeInterval = 0.1
Expand All @@ -26,9 +28,9 @@ class AppHangsWatchdogThreadTests: XCTestCase {
backtraceReporter: BacktraceReporterMock(),
telemetry: TelemetryMock()
)
watchdogThread.onHangEnded = { _ in
trackHangs.fulfill()
}
watchdogThread.onHangStarted = { _ in trackHangStarts.fulfill() }
watchdogThread.onHangEnded = { _, _ in trackHangEnds.fulfill() }
watchdogThread.onHangCancelled = { _ in XCTFail("It should not cancel the hang") }
watchdogThread.start()

// When (multiple hangs above threshold)
Expand Down Expand Up @@ -62,9 +64,9 @@ class AppHangsWatchdogThreadTests: XCTestCase {
backtraceReporter: BacktraceReporterMock(),
telemetry: TelemetryMock()
)
watchdogThread.onHangEnded = { _ in
doNotTrackHangs.fulfill()
}
watchdogThread.onHangStarted = { _ in doNotTrackHangs.fulfill() }
watchdogThread.onHangEnded = { _, _ in doNotTrackHangs.fulfill() }
watchdogThread.onHangCancelled = { _ in XCTFail("It should not cancel the hang") }
watchdogThread.start()

// When (multiple hangs below threshold)
Expand All @@ -84,7 +86,8 @@ class AppHangsWatchdogThreadTests: XCTestCase {
}

func testItTracksHangDateStackAndDuration() {
let trackHang = expectation(description: "track App Hang")
let trackHangStart = expectation(description: "track start of App Hang")
let trackHangEnd = expectation(description: "track end of App Hang")

// Given
let appHangThreshold: TimeInterval = 0.5
Expand All @@ -98,13 +101,19 @@ class AppHangsWatchdogThreadTests: XCTestCase {
backtraceReporter: BacktraceReporterMock(backtrace: .mockWith(stack: "Main thread stack")),
telemetry: TelemetryMock()
)
watchdogThread.onHangEnded = { hang in
XCTAssertEqual(hang.date, .mockDecember15th2019At10AMUTC())
watchdogThread.onHangStarted = { hang in
XCTAssertEqual(hang.startDate, .mockDecember15th2019At10AMUTC())
XCTAssertEqual(hang.backtraceResult.stack, "Main thread stack")
XCTAssertGreaterThanOrEqual(hang.duration, hangDuration * (1 - AppHangsWatchdogThread.Constants.tolerance))
XCTAssertLessThanOrEqual(hang.duration, hangDuration * (1 + AppHangsWatchdogThread.Constants.tolerance))
trackHang.fulfill()
trackHangStart.fulfill()
}
watchdogThread.onHangEnded = { hang, duration in
XCTAssertEqual(hang.startDate, .mockDecember15th2019At10AMUTC())
XCTAssertEqual(hang.backtraceResult.stack, "Main thread stack")
XCTAssertGreaterThanOrEqual(duration, hangDuration * (1 - AppHangsWatchdogThread.Constants.tolerance))
XCTAssertLessThanOrEqual(duration, hangDuration * (1 + AppHangsWatchdogThread.Constants.tolerance))
trackHangEnd.fulfill()
}
watchdogThread.onHangCancelled = { _ in XCTFail("It should not cancel the hang") }
watchdogThread.start()

// When
Expand All @@ -118,7 +127,8 @@ class AppHangsWatchdogThreadTests: XCTestCase {
}

func testWhenBacktraceGenerationIsNotSupported_itTracksAppHangWithErrorMessage() {
let trackHang = expectation(description: "track App Hang")
let trackHangStart = expectation(description: "track start of App Hang")
let trackHangEnd = expectation(description: "track end of App Hang")

// Given
let appHangThreshold: TimeInterval = 0.25
Expand All @@ -132,10 +142,15 @@ class AppHangsWatchdogThreadTests: XCTestCase {
backtraceReporter: BacktraceReporterMock(backtrace: nil), // backtrace generation not supported
telemetry: TelemetryMock()
)
watchdogThread.onHangEnded = { hang in
watchdogThread.onHangStarted = { hang in
XCTAssertEqual(hang.backtraceResult.stack, AppHangsObserver.Constants.appHangStackNotAvailableErrorMessage)
trackHang.fulfill()
trackHangStart.fulfill()
}
watchdogThread.onHangEnded = { hang, _ in
XCTAssertEqual(hang.backtraceResult.stack, AppHangsObserver.Constants.appHangStackNotAvailableErrorMessage)
trackHangEnd.fulfill()
}
watchdogThread.onHangCancelled = { _ in XCTFail("It should not cancel the hang") }
watchdogThread.start()

// When
Expand All @@ -149,7 +164,8 @@ class AppHangsWatchdogThreadTests: XCTestCase {
}

func testWhenBacktraceGenerationThrows_itTracksAppHangWithErrorMessage() {
let trackHang = expectation(description: "track App Hang")
let trackHangStart = expectation(description: "track start of App Hang")
let trackHangEnd = expectation(description: "track end of App Hang")

// Given
let appHangThreshold: TimeInterval = 0.25
Expand All @@ -163,9 +179,54 @@ class AppHangsWatchdogThreadTests: XCTestCase {
backtraceReporter: BacktraceReporterMock(backtraceGenerationError: NSError.mockRandom()), // backtrace generation error
telemetry: TelemetryMock()
)
watchdogThread.onHangEnded = { hang in
watchdogThread.onHangStarted = { hang in
XCTAssertEqual(hang.backtraceResult.stack, AppHangsObserver.Constants.appHangStackGenerationFailedErrorMessage)
trackHangStart.fulfill()
}
watchdogThread.onHangEnded = { hang, _ in
XCTAssertEqual(hang.backtraceResult.stack, AppHangsObserver.Constants.appHangStackGenerationFailedErrorMessage)
trackHang.fulfill()
trackHangEnd.fulfill()
}
watchdogThread.onHangCancelled = { _ in XCTFail("It should not cancel the hang") }
watchdogThread.start()

// When
queue.async {
Thread.sleep(forTimeInterval: hangDuration)
}

// Then
waitForExpectations(timeout: hangDuration * 10)
watchdogThread.cancel()
}

func testWhenHangDurationExceedsFalsePositiveThreshold_itReportsHangCancellation() {
let trackHangStart = expectation(description: "track start of App Hang")
let trackHangCancel = expectation(description: "track cancellation of App Hang")

// Given
let appHangThreshold: TimeInterval = 0.5
let hangDuration: TimeInterval = appHangThreshold * 2
let queue = DispatchQueue(label: "main-queue", qos: .userInteractive)

let watchdogThread = AppHangsWatchdogThread(
appHangThreshold: appHangThreshold,
queue: queue,
dateProvider: DateProviderMock(now: .mockDecember15th2019At10AMUTC()),
backtraceReporter: BacktraceReporterMock(backtrace: .mockWith(stack: "Main thread stack")),
telemetry: TelemetryMock(),
falsePositiveThreshold: hangDuration * 0.75
)
watchdogThread.onHangStarted = { hang in
XCTAssertEqual(hang.startDate, .mockDecember15th2019At10AMUTC())
XCTAssertEqual(hang.backtraceResult.stack, "Main thread stack")
trackHangStart.fulfill()
}
watchdogThread.onHangEnded = { _, _ in XCTFail("It should not end the hang") }
watchdogThread.onHangCancelled = { hang in
XCTAssertEqual(hang.startDate, .mockDecember15th2019At10AMUTC())
XCTAssertEqual(hang.backtraceResult.stack, "Main thread stack")
trackHangCancel.fulfill()
}
watchdogThread.start()

Expand Down

0 comments on commit 3af24b5

Please sign in to comment.