-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-4195][Core] Retry fetching block results when the fetch failure is caused by a connection timeout #3061
[SPARK-4195][Core] Retry fetching block results when the fetch failure is caused by a connection timeout #3061
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,13 +18,15 @@ | |
package org.apache.spark.network.nio | ||
|
||
import java.nio.ByteBuffer | ||
import java.io.IOException | ||
|
||
import org.apache.spark.network._ | ||
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} | ||
import org.apache.spark.storage.{BlockId, StorageLevel} | ||
import org.apache.spark.util.Utils | ||
import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} | ||
|
||
import scala.collection.mutable.HashMap | ||
import scala.concurrent.Future | ||
|
||
|
||
|
@@ -39,6 +41,10 @@ final class NioBlockTransferService(conf: SparkConf, securityManager: SecurityMa | |
|
||
private var blockDataManager: BlockDataManager = _ | ||
|
||
private val blockFailedCounts = new HashMap[Seq[String], Int] | ||
|
||
val maxRetryNum = conf.getInt("spark.shuffle.fetch.maxRetryNumber", 3) | ||
|
||
/** | ||
* Port number the service is listening on, available only after [[init]] is invoked. | ||
*/ | ||
|
@@ -96,6 +102,9 @@ final class NioBlockTransferService(conf: SparkConf, securityManager: SecurityMa | |
future.onSuccess { case message => | ||
val bufferMessage = message.asInstanceOf[BufferMessage] | ||
val blockMessageArray = BlockMessageArray.fromBufferMessage(bufferMessage) | ||
blockFailedCounts.synchronized { | ||
blockFailedCounts -= blockIds | ||
} | ||
|
||
// SPARK-4064: In some cases(eg. Remote block was removed) blockMessageArray may be empty. | ||
if (blockMessageArray.isEmpty) { | ||
|
@@ -121,8 +130,28 @@ final class NioBlockTransferService(conf: SparkConf, securityManager: SecurityMa | |
}(cm.futureExecContext) | ||
|
||
future.onFailure { case exception => | ||
blockIds.foreach { blockId => | ||
listener.onBlockFetchFailure(blockId, exception) | ||
exception match { | ||
case connectExcpt: IOException => | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not catch ConnectException directly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala#L963 does not catch ConnectException and instead throws an IOException, so here we only catch IOException. If we only want to handle connection errors, we would need to catch ConnectException in ConnectionManager.scala. |
||
logWarning("Failed to connect to " + hostName + ":" + port) | ||
val failedCount = blockFailedCounts.synchronized { | ||
val newFailedCount = blockFailedCounts(blockIds).getOrElse(0) + 1 | ||
blockFailedCounts(blockIds) = newFailedCount | ||
newFailedCount | ||
} | ||
if (failedCount >= maxRetryNum) { | ||
blockFailedCounts.synchronized { | ||
blockFailedCounts -= blockIds | ||
} | ||
blockIds.foreach { blockId => | ||
listener.onBlockFetchFailure(blockId, connectExcpt) | ||
} | ||
} else { | ||
fetchBlocks(hostName, port, blockIds, listener) | ||
} | ||
case t: Throwable => | ||
blockIds.foreach { blockId => | ||
listener.onBlockFetchFailure(blockId, t) | ||
} | ||
} | ||
}(cm.futureExecContext) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To avoid memory leaks, we need to be sure that this won't grow without bound. Let me try to walk through the cases...
So, looks like this adds a memory leak?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, thank you for finding this error.