Update #224 to add FSFetcher as a standalone fetcher #232
New file: FSFetcher.scala

@@ -0,0 +1,40 @@
/*
 * Copyright 2016 LinkedIn Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.linkedin.drelephant.spark.fetchers

import com.linkedin.drelephant.analysis.{AnalyticJob, ElephantFetcher}
import com.linkedin.drelephant.configurations.fetcher.FetcherConfigurationData
import com.linkedin.drelephant.spark.data.SparkApplicationData
import com.linkedin.drelephant.spark.legacydata.LegacyDataConverters
import org.apache.spark.deploy.history.SparkFSFetcher

/**
 * Wraps SparkFSFetcher, which holds the actual fetching logic, so that it
 * complies with the new SparkApplicationData interface.
 *
 * @param fetcherConfigurationData the fetcher configuration
 */
class FSFetcher(fetcherConfigurationData: FetcherConfigurationData)
  extends ElephantFetcher[SparkApplicationData] {
  lazy val legacyFetcher = new SparkFSFetcher(fetcherConfigurationData)

  override def fetchData(analyticJob: AnalyticJob): SparkApplicationData = {
    val legacyData = legacyFetcher.fetchData(analyticJob)
    LegacyDataConverters.convert(legacyData)
  }
}

object FSFetcher {
}
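For context, here is a minimal sketch of how a standalone fetcher like this is typically registered in Dr. Elephant's FetcherConf.xml. The entry shape follows the project's usual fetcher configuration schema; the event_log_location_uri parameter name matches LOG_LOCATION_URI_XML_FIELD in the SparkFetcher changes below, but the URI value is purely illustrative, not taken from this PR.

    <!-- Hypothetical FetcherConf.xml entry; the webhdfs URI is an
         illustrative placeholder. -->
    <fetcher>
      <applicationtype>spark</applicationtype>
      <classname>com.linkedin.drelephant.spark.fetchers.FSFetcher</classname>
      <params>
        <event_log_location_uri>webhdfs://namenode:50070/system/spark-history</event_log_location_uri>
      </params>
    </fetcher>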
File: SparkFetcher.scala

@@ -19,6 +19,7 @@ package com.linkedin.drelephant.spark.fetchers
 import scala.async.Async
 import scala.concurrent.{Await, ExecutionContext, Future}
 import scala.concurrent.duration.{Duration, SECONDS}
+import scala.util.{Try, Success, Failure}
 import scala.util.control.NonFatal

 import com.linkedin.drelephant.analysis.{AnalyticJob, ElephantFetcher}
@@ -36,17 +37,21 @@ import org.apache.spark.SparkConf
 class SparkFetcher(fetcherConfigurationData: FetcherConfigurationData)
   extends ElephantFetcher[SparkApplicationData] {
   import SparkFetcher._
+  import Async.{async, await}
   import ExecutionContext.Implicits.global

   private val logger: Logger = Logger.getLogger(classOf[SparkFetcher])

+  val eventLogUri = Option(fetcherConfigurationData.getParamMap.get(LOG_LOCATION_URI_XML_FIELD))
+  logger.info("The event log location of Spark application is set to " + eventLogUri)
+
   private[fetchers] lazy val hadoopConfiguration: Configuration = new Configuration()

   private[fetchers] lazy val sparkUtils: SparkUtils = SparkUtils

   private[fetchers] lazy val sparkConf: SparkConf = {
     val sparkConf = new SparkConf()
-    sparkUtils.getDefaultPropertiesFile(sparkUtils.defaultEnv) match {
+    sparkUtils.getDefaultPropertiesFile() match {
       case Some(filename) => sparkConf.setAll(sparkUtils.getPropertiesFromFile(filename))
       case None => throw new IllegalStateException("can't find Spark conf; please set SPARK_HOME or SPARK_CONF_DIR")
     }
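The switch from getDefaultPropertiesFile(sparkUtils.defaultEnv) to the no-argument getDefaultPropertiesFile() moves the environment lookup inside SparkUtils. As a rough sketch of the resolution that call presumably performs, assuming Spark's usual convention (this is a hypothetical reimplementation for illustration, not the project's code):

    import java.io.File

    // Hypothetical sketch of the lookup behind getDefaultPropertiesFile():
    // prefer $SPARK_CONF_DIR/spark-defaults.conf, fall back to
    // $SPARK_HOME/conf/spark-defaults.conf, and yield None if neither exists.
    def defaultPropertiesFile(env: Map[String, String] = sys.env): Option[String] =
      env.get("SPARK_CONF_DIR")
        .orElse(env.get("SPARK_HOME").map(_ + File.separator + "conf"))
        .map(_ + File.separator + "spark-defaults.conf")
        .filter(new File(_).isFile)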
@@ -65,25 +70,51 @@ class SparkFetcher(fetcherConfigurationData: FetcherConfigurationData)
   private[fetchers] lazy val sparkRestClient: SparkRestClient = new SparkRestClient(sparkConf)

   private[fetchers] lazy val sparkLogClient: SparkLogClient = {
-    new SparkLogClient(hadoopConfiguration, sparkConf)
+    new SparkLogClient(hadoopConfiguration, sparkConf, eventLogUri)
   }

   override def fetchData(analyticJob: AnalyticJob): SparkApplicationData = {
+    doFetchData(analyticJob) match {
+      case Success(data) => data
+      case Failure(e) => throw e
+    }
+  }
+
+  private def doFetchData(analyticJob: AnalyticJob): Try[SparkApplicationData] = {
     val appId = analyticJob.getAppId
     logger.info(s"Fetching data for ${appId}")
-    try {
-      Await.result(doFetchData(sparkRestClient, sparkLogClient, appId, eventLogSource),
-        DEFAULT_TIMEOUT)
-    } catch {
-      case NonFatal(e) =>
+    Try {
+      Await.result(doFetchDataUsingRestAndLogClients(analyticJob), DEFAULT_TIMEOUT)
+    }.transform(
+      data => {
+        logger.info(s"Succeeded fetching data for ${appId}")
+        Success(data)
+      },
+      e => {
         logger.error(s"Failed fetching data for ${appId}", e)
-        throw e
+        Failure(e)
+      }
+    )
   }

+  private def doFetchDataUsingRestAndLogClients(analyticJob: AnalyticJob): Future[SparkApplicationData] = async {
+    val appId = analyticJob.getAppId
+    val restDerivedData = await(sparkRestClient.fetchData(appId, eventLogSource == EventLogSource.Rest))
+
+    val logDerivedData = eventLogSource match {
+      case EventLogSource.None => None
+      case EventLogSource.Rest => restDerivedData.logDerivedData
+      case EventLogSource.WebHdfs =>
+        val lastAttemptId = restDerivedData.applicationInfo.attempts.maxBy { _.startTime }.attemptId
+        Some(await(sparkLogClient.fetchData(appId, lastAttemptId)))
+    }
+
+    SparkApplicationData(appId, restDerivedData, logDerivedData)
+  }
+
 }

Inline review comment on the new match block in fetchData:

Reviewer: Maybe this match block is not needed? Just call …
Author: fixed
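For readers unfamiliar with Try.transform, here is a self-contained toy example of the pattern the new doFetchData uses: both outcomes can be logged, and a failure is propagated as a value rather than rethrown at this layer. The parsePort function and its inputs are made up for illustration.

    import scala.util.{Try, Success, Failure}

    // Toy example of Try(...).transform(onSuccess, onFailure): each branch
    // may perform a side effect (here, logging) and must return a Try.
    def parsePort(s: String): Try[Int] =
      Try(s.toInt).transform(
        port => { println(s"parsed port $port"); Success(port) },
        e => { println(s"rejected '$s': ${e.getMessage}"); Failure(e) }
      )

    parsePort("8080") // Success(8080), after logging
    parsePort("oops") // Failure(java.lang.NumberFormatException), after logging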
 object SparkFetcher {
   import Async.{async, await}

   sealed trait EventLogSource

@@ -97,27 +128,6 @@
   }

   val SPARK_EVENT_LOG_ENABLED_KEY = "spark.eventLog.enabled"
-  val DEFAULT_TIMEOUT = Duration(30, SECONDS)
-
-  private def doFetchData(
-    sparkRestClient: SparkRestClient,
-    sparkLogClient: SparkLogClient,
-    appId: String,
-    eventLogSource: EventLogSource
-  )(
-    implicit ec: ExecutionContext
-  ): Future[SparkApplicationData] = async {
-    val restDerivedData = await(sparkRestClient.fetchData(
-      appId, eventLogSource == EventLogSource.Rest))
-
-    val logDerivedData = eventLogSource match {
-      case EventLogSource.None => None
-      case EventLogSource.Rest => restDerivedData.logDerivedData
-      case EventLogSource.WebHdfs =>
-        val lastAttemptId = restDerivedData.applicationInfo.attempts.maxBy { _.startTime }.attemptId
-        Some(await(sparkLogClient.fetchData(appId, lastAttemptId)))
-    }
-
-    SparkApplicationData(appId, restDerivedData, logDerivedData)
-  }
+  val DEFAULT_TIMEOUT = Duration(60, SECONDS)
+  val LOG_LOCATION_URI_XML_FIELD = "event_log_location_uri"
 }
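A compact sketch of the scala-async composition that doFetchDataUsingRestAndLogClients relies on, assuming the scala-async library is on the classpath; fetchRest, fetchLogs, and the application id are placeholders, not names from this PR.

    import scala.async.Async.{async, await}
    import scala.concurrent.{Await, Future}
    import scala.concurrent.ExecutionContext.Implicits.global
    import scala.concurrent.duration.{Duration, SECONDS}

    // Placeholder fetchers standing in for the REST and log clients.
    def fetchRest(appId: String): Future[String] = Future(s"rest-data:$appId")
    def fetchLogs(appId: String): Future[String] = Future(s"log-data:$appId")

    // The log fetch starts only after the REST result arrives, mirroring how
    // the fetcher decides on WebHdfs log fetching from REST-derived data.
    def fetchAll(appId: String): Future[(String, String)] = async {
      val rest = await(fetchRest(appId)) // suspends without blocking a thread
      val logs = await(fetchLogs(appId))
      (rest, logs)
    }

    // As in fetchData, Await.result bounds the composition with a timeout.
    Await.result(fetchAll("application_0000000000000_0001"), Duration(60, SECONDS))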
Review conversation

Reviewer: I don't think we need the legacy package and the other legacy classes in it. Since we will be using the file system fetcher, it is no longer legacy code. We should instead have all the relevant classes in this fetchers package.

Author: The way SparkFSFetcher reads the event logs needs to be revised so that it does not rely on older APIs like ReplayBus. That's why I have kept it as legacy for now. I will fix that and then move it into fetchers in a separate PR.

Reviewer: Alright. That makes sense. Thank you.