Skip to content

Commit

Permalink
[SPARK-30285][CORE] Fix deadlock between LiveListenerBus#stop and AsyncEventQueue#removeListenerOnError

Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

There is a deadlock between `LiveListenerBus#stop` and `AsyncEventQueue#removeListenerOnError`.

We can reproduce as follows:

1. Post some events to `LiveListenerBus`
2. Call `LiveListenerBus#stop`, which acquires the synchronized lock of `bus` (https://github.com/apache/spark/blob/5e92301723464d0876b5a7eec59c15fed0c5b98c/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala#L229), waits until all the events are processed by listeners, and then removes all the queues
3. The event queue drains out events by posting them to its listeners. If a listener is interrupted, it calls `AsyncEventQueue#removeListenerOnError`, which in turn calls `bus.removeListener` (https://github.com/apache/spark/blob/7b1b60c7583faca70aeab2659f06d4e491efa5c0/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala#L207), trying to acquire the synchronized lock of `bus` — resulting in deadlock

This PR removes the `synchronized` block from `LiveListenerBus.stop` because the underlying data structures are themselves thread-safe.

### Why are the changes needed?
To fix deadlock.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
A new unit test that reproduces the deadlock scenario.

Closes #26924 from wangshuo128/event-queue-race-condition.

Authored-by: Wang Shuo <wangshuo128@gmail.com>
Signed-off-by: Marcelo Vanzin <vanzin@cloudera.com>
  • Loading branch information
wangshuo128 authored and Marcelo Vanzin committed Jan 3, 2020
1 parent 1b0570c commit 10cae04
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -226,10 +226,8 @@ private[spark] class LiveListenerBus(conf: SparkConf) {
return
}

synchronized {
queues.asScala.foreach(_.stop())
queues.clear()
}
queues.asScala.foreach(_.stop())
queues.clear()
}

// For testing only.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,47 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
}
}

// Regression test for SPARK-30285: LiveListenerBus.stop used to hold the bus's monitor
// while queues drained; a listener interrupted during draining triggers
// AsyncEventQueue.removeListenerOnError, which tried to re-acquire that same monitor,
// producing a deadlock. Run twice: once with the listener throwing
// InterruptedException, once with it only setting the thread's interrupt flag.
Seq(true, false).foreach { throwInterruptedException =>
val suffix = if (throwInterruptedException) "throw interrupt" else "set Thread interrupted"
test(s"SPARK-30285: Fix deadlock in AsyncEventQueue.removeListenerOnError: $suffix") {
val LISTENER_BUS_STOP_WAITING_TIMEOUT_MILLIS = 10 * 1000L // 10 seconds
val bus = new LiveListenerBus(new SparkConf(false))
val counter1 = new BasicJobCounter()
val counter2 = new BasicJobCounter()
// Listener that blocks until told to proceed, then interrupts itself on job id 3.
val interruptingListener = new DelayInterruptingJobCounter(throwInterruptedException, 3)
// The interrupting listener shares a queue with counter1 so an interrupt on that
// queue exercises removeListenerOnError while counter2's queue keeps draining.
bus.addToSharedQueue(counter1)
bus.addToSharedQueue(interruptingListener)
bus.addToEventLogQueue(counter2)
assert(bus.activeQueues() === Set(SHARED_QUEUE, EVENT_LOG_QUEUE))
assert(bus.findListenersByClass[BasicJobCounter]().size === 2)
assert(bus.findListenersByClass[DelayInterruptingJobCounter]().size === 1)

bus.start(mockSparkContext, mockMetricsSystem)

// Queue up five job-end events; the interrupting listener stalls draining until
// `sleep` is cleared below, so all events are still pending when stop() begins.
(0 until 5).foreach { jobId =>
bus.post(SparkListenerJobEnd(jobId, jobCompletionTime, JobSucceeded))
}

// Call bus.stop in a separate thread, otherwise we will block here until bus is stopped
val stoppingThread = new Thread(() => {
bus.stop()
})
stoppingThread.start()
// Unblock the interrupting listener so event draining (and the interrupt) can happen
// while stop() is in progress — this is the window where the old code deadlocked.
interruptingListener.sleep = false
// Wait for bus to stop; the timeout bounds the test if the deadlock regresses.
stoppingThread.join(LISTENER_BUS_STOP_WAITING_TIMEOUT_MILLIS)

// Stopping must have finished within the timeout (i.e. no deadlock occurred).
assert(stoppingThread.isAlive === false)
// All queues are removed
assert(bus.activeQueues() === Set.empty)
// Both plain counters saw all five events despite the interrupt on the shared queue.
assert(counter1.count === 5)
assert(counter2.count === 5)
// The interrupting listener counted jobs 0-2 and interrupted on job 3.
// NOTE(review): `count` is read here after stoppingThread.join, which presumably
// establishes happens-before with the dispatch thread — confirm `count` visibility.
assert(interruptingListener.count === 3)
}
}

test("event queue size can be configued through spark conf") {
// configure the shared queue size to be 1, event log queue size to be 2,
// and listner bus event queue size to be 5
Expand Down Expand Up @@ -627,6 +668,35 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
}
}
}

/**
 * Test listener that stalls inside `onJobEnd` while `sleep` is true. Once released,
 * it interrupts itself when the completed job's id equals `interruptOnJobId`
 * (either by throwing `InterruptedException` or by setting the thread's interrupt
 * flag, per `throwInterruptedException`); every other job end is tallied in `count`.
 */
private class DelayInterruptingJobCounter(
    val throwInterruptedException: Boolean,
    val interruptOnJobId: Int) extends SparkListener {
  @volatile var sleep = true
  var count = 0

  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    // Poll with short naps until the test flips `sleep` to false.
    while (sleep) {
      Thread.sleep(10)
    }
    if (jobEnd.jobId != interruptOnJobId) {
      count += 1
    } else if (throwInterruptedException) {
      throw new InterruptedException("got interrupted")
    } else {
      Thread.currentThread().interrupt()
    }
  }
}
}

// These classes can't be declared inside of the SparkListenerSuite class because we don't want
Expand Down

0 comments on commit 10cae04

Please sign in to comment.