-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-20464][SS] Add a job group and description for streaming queries and fix cancellation of running jobs using the job group #17765
Changes from 3 commits
07e182b
bd13a01
7a58547
6ab66e2
992d68f
915d67b
f9342c9
6e66638
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -252,6 +252,7 @@ class StreamExecution( | |
*/ | ||
private def runBatches(): Unit = { | ||
try { | ||
sparkSession.sparkContext.setJobGroup(runId.toString, getBatchDescriptionString) | ||
if (sparkSession.sessionState.conf.streamingMetricsEnabled) { | ||
sparkSession.sparkContext.env.metricsSystem.registerSource(streamMetrics) | ||
} | ||
|
@@ -308,6 +309,7 @@ class StreamExecution( | |
logDebug(s"batch ${currentBatchId} committed") | ||
// We'll increase currentBatchId after we complete processing current batch's data | ||
currentBatchId += 1 | ||
sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Update job description with updated |
||
} else { | ||
currentStatus = currentStatus.copy(isDataAvailable = false) | ||
updateStatusMessage("Waiting for data to arrive") | ||
|
@@ -418,6 +420,7 @@ class StreamExecution( | |
/* First assume that we are re-executing the latest known batch | ||
* in the offset log */ | ||
currentBatchId = latestBatchId | ||
sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) | ||
availableOffsets = nextOffsets.toStreamProgress(sources) | ||
/* Initialize committed offsets to a committed batch, which at this | ||
* is the second latest batch id in the offset log. */ | ||
|
@@ -463,6 +466,7 @@ class StreamExecution( | |
} | ||
} | ||
currentBatchId = latestCommittedBatchId + 1 | ||
sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) | ||
committedOffsets ++= availableOffsets | ||
// Construct a new batch be recomputing availableOffsets | ||
constructNextBatch() | ||
|
@@ -478,6 +482,7 @@ class StreamExecution( | |
case None => // We are starting this stream for the first time. | ||
logInfo(s"Starting new streaming query.") | ||
currentBatchId = 0 | ||
sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) | ||
constructNextBatch() | ||
} | ||
} | ||
|
@@ -684,8 +689,11 @@ class StreamExecution( | |
// intentionally | ||
state.set(TERMINATED) | ||
if (microBatchThread.isAlive) { | ||
sparkSession.sparkContext.cancelJobGroup(runId.toString) | ||
microBatchThread.interrupt() | ||
microBatchThread.join() | ||
// microBatchThread may spawn new jobs, so we need to cancel again to prevent a leak | ||
sparkSession.sparkContext.cancelJobGroup(runId.toString) | ||
} | ||
logInfo(s"Query $prettyIdString was stopped") | ||
} | ||
|
@@ -825,6 +833,11 @@ class StreamExecution( | |
} | ||
} | ||
|
||
private def getBatchDescriptionString: String = { | ||
val batchDescription = if (currentBatchId < 0) "init" else currentBatchId.toString | ||
Option(name).map(_ + " ").getOrElse("") + | ||
s"[batch = $batchDescription, id = $id, runId = $runId]" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The description looks good to me. @marmbrus what do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. LGTM |
||
} | ||
} | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@brkyvz is this okay?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, this seems fine
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note here it is
setJobGroup(..., interruptOnCancel = false)
. Should we add a sql conf forinterruptOnCancel
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
when would you want to set this to true or false?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Had this set to false due to HDFS-1208, but setting it to true since the HDFS bug is 7 years old.