-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Datadog contrib for monitoring purpose
Datadog is a tool that allows you to send metrics that you create dashboard and add alerting on specific behaviors. Adding this contrib will allow for users of this tool to log their pipeline information to Datadog. Based on the status change of a task, we log that information to Datadog with the parameters that were used to run that specific task. This allow us to easily create dashboard to visualize the health. For example, we can be notified via Datadog if a task has failed, or we can graph the execution time of a specific task over a period of time. The implementation idea was strongly based on the stale PR #2044.
- Loading branch information
Showing
3 changed files
with
179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from luigi import parameter | ||
from luigi.metrics import MetricsCollector | ||
from luigi.task import Config | ||
|
||
from datadog import initialize, api, statsd | ||
|
||
|
||
class datadog(Config): | ||
api_key = parameter.Parameter(default='dummy_api_key', description='API key provided by Datadog') | ||
app_key = parameter.Parameter(default='dummy_app_key', description='APP key provided by Datadog') | ||
default_tags = parameter.Parameter(default='application:luigi', description='Default tags for every events and metrics sent to Datadog') | ||
environment = parameter.Parameter(default='development', description="Environment of which the pipeline is ran from (eg: 'production', 'staging', ...") | ||
metric_namespace = parameter.Parameter(default='luigi', description="Default namespace for events and metrics (eg: 'luigi' for 'luigi.task.started')") | ||
statsd_host = parameter.Parameter(default='localhost', description='StatsD host implementing the Datadog service') | ||
statsd_port = parameter.IntParameter(default=8125, description='StatsD port implementing the Datadog service') | ||
|
||
|
||
class DatadogMetricsCollector(MetricsCollector): | ||
def __init__(self, *args, **kwargs): | ||
super(DatadogMetricsCollector, self).__init__(*args, **kwargs) | ||
self._config = datadog(**kwargs) | ||
|
||
initialize(api_key=self._config.api_key, | ||
app_key=self._config.app_key, | ||
statsd_host=self._config.statsd_host, | ||
statsd_port=self._config.statsd_port) | ||
|
||
def handle_task_started(self, task): | ||
title = "Luigi: A task has been started!" | ||
text = "A task has been started in the pipeline named: {name}".format(name=task.family) | ||
tags = ["task_name:{name}".format(name=task.family)] + self._format_task_params_to_tags(task) | ||
|
||
self.send_increment('task.started', tags=tags) | ||
|
||
event_tags = tags + ["task_state:STARTED"] | ||
self.send_event(title=title, text=text, tags=event_tags, alert_type='info', priority='low') | ||
|
||
def handle_task_failed(self, task): | ||
title = "Luigi: A task has failed!" | ||
text = "A task has failed in the pipeline named: {name}".format(name=task.family) | ||
tags = ["task_name:{name}".format(name=task.family)] + self._format_task_params_to_tags(task) | ||
|
||
self.send_increment('task.failed', tags=tags) | ||
|
||
event_tags = tags + ["task_state:FAILED"] | ||
self.send_event(title=title, text=text, tags=event_tags, alert_type='error', priority='normal') | ||
|
||
def handle_task_disabled(self, task, config): | ||
title = "Luigi: A task has been disabled!" | ||
text = """A task has been disabled in the pipeline named: {name}. | ||
The task has failed {failures} times in the last {window} | ||
seconds, so it is being disabled for {persist} seconds.""".format( | ||
name=task.family, | ||
persist=config.disable_persist, | ||
failures=task.retry_policy.retry_count, | ||
window=config.disable_window | ||
) | ||
tags = ["task_name:{name}".format(name=task.family)] + self._format_task_params_to_tags(task) | ||
|
||
self.send_increment('task.disabled', tags=tags) | ||
|
||
event_tags = tags + ["task_state:DISABLED"] | ||
self.send_event(title=title, text=text, tags=event_tags, alert_type='error', priority='normal') | ||
|
||
def handle_task_done(self, task): | ||
# The task is already done -- Let's not re-create an event | ||
if task.time_running is None: | ||
return | ||
|
||
title = "Luigi: A task has been completed!" | ||
text = "A task has completed in the pipeline named: {name}".format(name=task.family) | ||
tags = ["task_name:{name}".format(name=task.family)] + self._format_task_params_to_tags(task) | ||
|
||
time_elapse = task.updated - task.time_running | ||
|
||
self.send_increment('task.done', tags=tags) | ||
self.send_gauge('task.execution_time', time_elapse, tags=tags) | ||
|
||
event_tags = tags + ["task_state:DONE"] | ||
self.send_event(title=title, text=text, tags=event_tags, alert_type='info', priority='low') | ||
|
||
def send_event(self, title=None, text=None, tags=[], alert_type='info', priority='normal'): | ||
all_tags = tags + self.default_tags() | ||
|
||
api.Event.create(title=title, text=text, tags=all_tags, alert_type=alert_type, priority=priority) | ||
|
||
def send_gauge(self, metric_name, value, tags=[]): | ||
all_tags = tags + self.default_tags() | ||
|
||
namespaced_metric = "{namespace}.{metric_name}".format(namespace=self._config.metric_namespace, | ||
metric_name=metric_name) | ||
statsd.gauge(namespaced_metric, value, tags=all_tags) | ||
|
||
def send_increment(self, metric_name, value=1, tags=[]): | ||
all_tags = tags + self.default_tags() | ||
|
||
namespaced_metric = "{namespace}.{metric_name}".format(namespace=self._config.metric_namespace, | ||
metric_name=metric_name) | ||
statsd.increment(namespaced_metric, value, tags=all_tags) | ||
|
||
def _format_task_params_to_tags(self, task): | ||
params = [] | ||
for key, value in task.params.items(): | ||
params.append("{key}:{value}".format(key=key, value=value)) | ||
|
||
return params | ||
|
||
def default_tags(self): | ||
default_tags = [] | ||
|
||
env_tag = "environment:{environment}".format(environment=self._config.environment) | ||
default_tags.append(env_tag) | ||
|
||
if self._config.default_tags: | ||
default_tags = default_tags + str.split(self._config.default_tags, ',') | ||
|
||
return default_tags |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
class MetricsCollector(object): | ||
"""Dummy MetricsCollecter base class that can be replace by tool specific | ||
implementation. | ||
""" | ||
def __init__(self): | ||
pass | ||
|
||
def handle_task_started(self, task): | ||
pass | ||
|
||
def handle_task_failed(self, task): | ||
pass | ||
|
||
def handle_task_disabled(self, task, config): | ||
pass | ||
|
||
def handle_task_done(self, task): | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters