Skip to content

Commit

Permalink
Adding Auto Tuning Feature (#338)
Browse files Browse the repository at this point in the history
Co-authored-by: Arpan Agrawal <arpang@users.noreply.github.com>
Co-authored-by: Manoj Kumar <mkumar1984@users.noreply.github.com>
  • Loading branch information
3 people authored and akshayrai committed Feb 21, 2018
1 parent c182c98 commit b2c24b8
Show file tree
Hide file tree
Showing 57 changed files with 5,884 additions and 44 deletions.
6 changes: 5 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
language: scala
sudo: false
sudo: true
jdk:
- oraclejdk8
- oraclejdk7
python: "2.6"
install:
- sudo pip install inspyred
- sudo pip install argparse

# only build PRs and master (not all branch pushes)
branches:
Expand Down
10 changes: 10 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,15 @@ Copyright (c) 2016 Yehuda Katz, Tom Dale and Ember.js contributors
License: (https://github.com/emberjs/ember.js/blob/master/LICENSE)

------------------------------------------------------------------------------
Attribution for Python Libraries
------------------------------------------------------------------------------

inspyred (https://github.com/aarongarrett/inspyred/)
Copyright (c) 2017, Aaron Garrett
License: MIT License (https://github.com/aarongarrett/inspyred/blob/master/LICENSE)






67 changes: 67 additions & 0 deletions app-conf/AutoTuningConf.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright 2016 LinkedIn Corp.
Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.
-->

<!-- General configurations -->
<configuration>
<property>
<name>autotuning.enabled</name>
<value>false</value>
<description>Enable or disable auto tuning</description>
</property>
<property>
<name>autotuning.default.allowed_max_resource_usage_percent</name>
<value>150</value>
<description>Default value for maximum allowed overheard in resource usage (in percent) as compared to the average
value
</description>
</property>
<property>
<name>autotuning.default.allowed_max_execution_time_percent</name>
<value>150</value>
<description>Default value for maximum allowed overheard in execution time (in percent) as compared to the average
value
</description>
</property>
<property>
<name>autotuning.daemon.wait.interval.ms</name>
<value>60000</value>
<description>Auto tuning daemon wait interval</description>
</property>
<property>
<name>baseline.execution.count</name>
<value>30</value>
<description>Baseline to be done on recent number of execution</description>
</property>
<property>
<name>fitness.compute.wait_interval.ms</name>
<value>1800000</value>
<description>Wait time after the job is completed for fitness computation</description>
</property>
<property>
<name>dr.elephant.api.url</name>
<value>http://ltx1-holdemdre01.grid.linkedin.com:8080</value>
<description>API URL for Dr Elephant. Optional. This is needed only if you are using APIFitnessComputeUtil to get
fitness from Dr Elephant APIs
</description>
</property>
<!--The below property is optional-->
<!--<property>-->
<!--<name>python.path</name>-->
<!--<value>/path/to/python/binary</value>-->
<!--<description>Root directory for python</description>-->
<!--</property>-->
</configuration>
80 changes: 80 additions & 0 deletions app/com/linkedin/drelephant/AutoTuner.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright 2016 LinkedIn Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package com.linkedin.drelephant;

import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;

import com.linkedin.drelephant.analysis.HDFSContext;
import com.linkedin.drelephant.tuning.AzkabanJobCompleteDetector;
import com.linkedin.drelephant.tuning.BaselineComputeUtil;
import com.linkedin.drelephant.tuning.FitnessComputeUtil;
import com.linkedin.drelephant.tuning.JobCompleteDetector;
import com.linkedin.drelephant.tuning.PSOParamGenerator;
import com.linkedin.drelephant.tuning.ParamGenerator;
import com.linkedin.drelephant.util.Utils;

import controllers.AutoTuningMetricsController;


/**
*This class is the AutoTuner Daemon class which runs following thing in order.
* - BaselineComputeUtil: Baseline computation for new jobs which are auto tuning enabled
* - JobCompleteDetector: Detect if the current execution of the jobs is completed and update the status in DB
* - APIFitnessComputeUtil: Compute the recently succeeded jobs fitness
* - ParamGenerator : Generate the next set of parameters for suggestion
*/
public class AutoTuner implements Runnable {

public static final long ONE_MIN = 60 * 1000;
private static final Logger logger = Logger.getLogger(AutoTuner.class);
private static final long DEFAULT_METRICS_COMPUTATION_INTERVAL = ONE_MIN / 5;

public static final String AUTO_TUNING_DAEMON_WAIT_INTERVAL = "autotuning.daemon.wait.interval.ms";

public void run() {

logger.info("Starting Auto Tuning thread");
HDFSContext.load();
Configuration configuration = ElephantContext.instance().getAutoTuningConf();

Long interval =
Utils.getNonNegativeLong(configuration, AUTO_TUNING_DAEMON_WAIT_INTERVAL, DEFAULT_METRICS_COMPUTATION_INTERVAL);

try {
AutoTuningMetricsController.init();
BaselineComputeUtil baselineComputeUtil = new BaselineComputeUtil();
FitnessComputeUtil fitnessComputeUtil = new FitnessComputeUtil();
ParamGenerator paramGenerator = new PSOParamGenerator();
while (!Thread.currentThread().isInterrupted()) {
try {
baselineComputeUtil.computeBaseline();
JobCompleteDetector jobCompleteDetector = new AzkabanJobCompleteDetector();
jobCompleteDetector.updateCompletedExecutions();
fitnessComputeUtil.updateFitness();
paramGenerator.getParams();
} catch (Exception e) {
logger.error("Error in auto tuner thread ", e);
}
Thread.sleep(interval);
}
} catch (Exception e) {
logger.error("Error in auto tuner thread ", e);
}
logger.info("Auto tuning thread shutting down");
}
}
27 changes: 27 additions & 0 deletions app/com/linkedin/drelephant/DrElephant.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,52 @@

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;

import com.linkedin.drelephant.analysis.HDFSContext;


/**
* The main class which starts Dr. Elephant
*/
public class DrElephant extends Thread {
public static final String AUTO_TUNING_ENABLED = "autotuning.enabled";
private static final Logger logger = Logger.getLogger(DrElephant.class);

private ElephantRunner _elephant;
private AutoTuner _autoTuner;
private Thread _autoTunerThread;

private Boolean autoTuningEnabled;

public DrElephant() throws IOException {
HDFSContext.load();
Configuration configuration = ElephantContext.instance().getAutoTuningConf();
autoTuningEnabled = configuration.getBoolean(AUTO_TUNING_ENABLED, false);
logger.debug("Auto Tuning Configuration: " + configuration.toString());
_elephant = new ElephantRunner();
if (autoTuningEnabled) {
_autoTuner = new AutoTuner();
_autoTunerThread = new Thread(_autoTuner, "Auto Tuner Thread");
}
}

@Override
public void run() {
if (_autoTunerThread != null) {
logger.debug("Starting auto tuner thread ");
_autoTunerThread.start();
}
_elephant.run();
}

public void kill() {
if (_elephant != null) {
_elephant.kill();
}
if (_autoTunerThread != null) {
_autoTunerThread.interrupt();
}
}
}
43 changes: 31 additions & 12 deletions app/com/linkedin/drelephant/ElephantContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,21 @@ public class ElephantContext {
private static final String HEURISTICS_CONF = "HeuristicConf.xml";
private static final String JOB_TYPES_CONF = "JobTypeConf.xml";
private static final String GENERAL_CONF = "GeneralConf.xml";
private static final String AUTO_TUNING_CONF = "AutoTuningConf.xml";

private final Map<String, List<String>> _heuristicGroupedNames = new HashMap<String, List<String>>();
private List<HeuristicConfigurationData> _heuristicsConfData;
private List<FetcherConfigurationData> _fetchersConfData;
private Configuration _generalConf;

private Configuration _autoTuningConf;
private List<AggregatorConfigurationData> _aggregatorConfData;

private final Map<String, ApplicationType> _nameToType = new HashMap<String, ApplicationType>();
private final Map<ApplicationType, List<Heuristic>> _typeToHeuristics = new HashMap<ApplicationType, List<Heuristic>>();
private final Map<ApplicationType, HadoopMetricsAggregator> _typeToAggregator = new HashMap<ApplicationType, HadoopMetricsAggregator>();
private final Map<ApplicationType, List<Heuristic>> _typeToHeuristics =
new HashMap<ApplicationType, List<Heuristic>>();
private final Map<ApplicationType, HadoopMetricsAggregator> _typeToAggregator =
new HashMap<ApplicationType, HadoopMetricsAggregator>();
private final Map<ApplicationType, ElephantFetcher> _typeToFetcher = new HashMap<ApplicationType, ElephantFetcher>();
private final Map<String, Html> _heuristicToView = new HashMap<String, Html>();
private Map<ApplicationType, List<JobType>> _appTypeToJobTypes = new HashMap<ApplicationType, List<JobType>>();
Expand All @@ -96,20 +101,24 @@ private ElephantContext() {
loadConfiguration();
}

public Configuration getAutoTuningConf() {
return _autoTuningConf;
}

private void loadConfiguration() {
loadAggregators();
loadFetchers();
loadHeuristics();
loadJobTypes();

loadGeneralConf();
loadAutoTuningConf();

// It is important to configure supported types in the LAST step so that we could have information from all
// configurable components.
configureSupportedApplicationTypes();
}


private void loadAggregators() {
Document document = Utils.loadXMLDoc(AGGREGATORS_CONF);

Expand All @@ -119,8 +128,8 @@ private void loadAggregators() {
Class<?> aggregatorClass = Class.forName(data.getClassName());
Object instance = aggregatorClass.getConstructor(AggregatorConfigurationData.class).newInstance(data);
if (!(instance instanceof HadoopMetricsAggregator)) {
throw new IllegalArgumentException(
"Class " + aggregatorClass.getName() + " is not an implementation of " + HadoopMetricsAggregator.class.getName());
throw new IllegalArgumentException("Class " + aggregatorClass.getName() + " is not an implementation of "
+ HadoopMetricsAggregator.class.getName());
}

ApplicationType type = data.getAppType();
Expand All @@ -145,6 +154,7 @@ private void loadAggregators() {
}

}

/**
* Load all the fetchers configured in FetcherConf.xml
*/
Expand All @@ -157,8 +167,8 @@ private void loadFetchers() {
Class<?> fetcherClass = Class.forName(data.getClassName());
Object instance = fetcherClass.getConstructor(FetcherConfigurationData.class).newInstance(data);
if (!(instance instanceof ElephantFetcher)) {
throw new IllegalArgumentException(
"Class " + fetcherClass.getName() + " is not an implementation of " + ElephantFetcher.class.getName());
throw new IllegalArgumentException("Class " + fetcherClass.getName() + " is not an implementation of "
+ ElephantFetcher.class.getName());
}

ApplicationType type = data.getAppType();
Expand Down Expand Up @@ -198,8 +208,8 @@ private void loadHeuristics() {

Object instance = heuristicClass.getConstructor(HeuristicConfigurationData.class).newInstance(data);
if (!(instance instanceof Heuristic)) {
throw new IllegalArgumentException(
"Class " + heuristicClass.getName() + " is not an implementation of " + Heuristic.class.getName());
throw new IllegalArgumentException("Class " + heuristicClass.getName() + " is not an implementation of "
+ Heuristic.class.getName());
}
ApplicationType type = data.getAppType();
List<Heuristic> heuristics = _typeToHeuristics.get(type);
Expand Down Expand Up @@ -249,9 +259,8 @@ private void loadHeuristics() {
}

// Bind No_DATA heuristic to its helper pages, no need to add any real configurations
_heuristicsConfData.add(
new HeuristicConfigurationData(HeuristicResult.NO_DATA.getHeuristicName(),
HeuristicResult.NO_DATA.getHeuristicClassName(), "views.html.help.helpNoData", null, null));
_heuristicsConfData.add(new HeuristicConfigurationData(HeuristicResult.NO_DATA.getHeuristicName(),
HeuristicResult.NO_DATA.getHeuristicClassName(), "views.html.help.helpNoData", null, null));
}

/**
Expand Down Expand Up @@ -308,6 +317,16 @@ private void loadGeneralConf() {
_generalConf.addResource(this.getClass().getClassLoader().getResourceAsStream(GENERAL_CONF));
}

/**
* Load in the AutoTuningConf.xml file as a configuration object for other objects to access
*/
private void loadAutoTuningConf() {
logger.info("Loading configuration file " + AUTO_TUNING_CONF);

_autoTuningConf = new Configuration();
_autoTuningConf.addResource(this.getClass().getClassLoader().getResourceAsStream(AUTO_TUNING_CONF));
}

/**
* Given an application type, return the currently bound heuristics
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@
* the License.
*/

package com.linkedin.drelephant.exceptions;
package com.linkedin.drelephant.clients;

import java.io.File;
import java.util.Map;
import java.util.Set;

import com.linkedin.drelephant.exceptions.JobState;
import com.linkedin.drelephant.exceptions.LoggingEvent;


/**
* The interface WorkflowClient should be implemented by all the workflow client. The client should not
Expand Down
Loading

0 comments on commit b2c24b8

Please sign in to comment.