From f53a7065384c8cbe310da6beb10765e9d117087d Mon Sep 17 00:00:00 2001 From: jbannister Date: Mon, 25 Apr 2022 16:26:26 +0100 Subject: [PATCH] Adding prometheus metrics to webapp whenever a failure / success is found by the report_hunter --- notebooker/execute_notebook.py | 1 - notebooker/utils/notebook_execution.py | 2 +- notebooker/web/report_hunter.py | 31 +++++++++++--- notebooker/web/routes/prometheus.py | 20 +++++++++ tests/integration/test_report_hunter.py | 54 +++++++++++++++++++++++++ 5 files changed, 101 insertions(+), 7 deletions(-) diff --git a/notebooker/execute_notebook.py b/notebooker/execute_notebook.py index 750ba0b4..cf922831 100644 --- a/notebooker/execute_notebook.py +++ b/notebooker/execute_notebook.py @@ -39,7 +39,6 @@ def _run_checks( generate_pdf_output: Optional[bool] = True, hide_code: Optional[bool] = False, mailto: Optional[str] = "", - error_mailto: Optional[str] = "", email_subject: Optional[str] = "", prepare_only: Optional[bool] = False, notebooker_disable_git: bool = False, diff --git a/notebooker/utils/notebook_execution.py b/notebooker/utils/notebook_execution.py index 8926a905..b81d9933 100644 --- a/notebooker/utils/notebook_execution.py +++ b/notebooker/utils/notebook_execution.py @@ -3,7 +3,7 @@ import shutil import tempfile from logging import getLogger -from typing import AnyStr, Union +from typing import Union from notebooker.constants import TEMPLATE_DIR_SEPARATOR, NotebookResultComplete, NotebookResultError from notebooker.utils.mail import mail diff --git a/notebooker/web/report_hunter.py b/notebooker/web/report_hunter.py index 1aabc88d..7d9b8629 100644 --- a/notebooker/web/report_hunter.py +++ b/notebooker/web/report_hunter.py @@ -11,7 +11,23 @@ logger = getLogger(__name__) -def _report_hunter(webapp_config: WebappConfig, run_once: bool = False, timeout: int = 5): +def try_register_success_prometheus(report_name: str, report_title: str): + try: + from notebooker.web.routes.prometheus import record_successful_report + record_successful_report(report_name, report_title) + except ImportError as e: + logger.info(f"Attempted to log success to prometheus but failed with ImportError({e}).") + + +def try_register_fail_prometheus(report_name: str, report_title: str): + try: + from notebooker.web.routes.prometheus import record_failed_report + record_failed_report(report_name, report_title) + except ImportError as e: + logger.info(f"Attempted to log failure to prometheus but failed with ImportError({e}).") + + +def _report_hunter(webapp_config: WebappConfig, run_once: bool = False, timeout: int = 120): """ This is a function designed to run in a thread alongside the webapp. It updates the cache which the web app reads from and performs some admin on pending/running jobs. The function terminates either when @@ -21,12 +37,13 @@ def _report_hunter(webapp_config: WebappConfig, run_once: bool = False, timeout: :param run_once: Whether to infinitely run this function or not. :param timeout: - The time in seconds that we cache results. + The time in seconds that we cache results. Defaults to 120s. :param serializer_kwargs: Any kwargs which are required for a Serializer to be initialised successfully. """ serializer = initialize_serializer_from_config(webapp_config) last_query = None + refresh_period_seconds = 10 while not os.getenv("NOTEBOOKER_APP_STOPPING"): try: ct = 0 @@ -51,8 +68,8 @@ def _report_hunter(webapp_config: WebappConfig, run_once: bool = False, timeout: "Please try again! Timed out after {:.0f} minutes " "{:.0f} seconds.".format(delta_seconds / 60, delta_seconds % 60), ) - # Finally, check we have the latest updates - _last_query = datetime.datetime.now() - datetime.timedelta(minutes=1) + # Finally, check we have the latest updates with a small buffer + _last_query = datetime.datetime.now() - datetime.timedelta(seconds=refresh_period_seconds) query_results = serializer.get_all_results(since=last_query) for result in query_results: ct += 1 @@ -61,6 +78,10 @@ def _report_hunter(webapp_config: WebappConfig, run_once: bool = False, timeout: set_report_cache( result.report_name, result.job_id, result, timeout=timeout, cache_dir=webapp_config.CACHE_DIR ) + if result.status == JobStatus.DONE: + try_register_success_prometheus(result.report_name, result.report_title) + if result.status == JobStatus.ERROR: + try_register_fail_prometheus(result.report_name, result.report_title) logger.info( "Report-hunter found a change for {} (status: {}->{})".format( result.job_id, existing.status if existing else None, result.status @@ -74,5 +95,5 @@ def _report_hunter(webapp_config: WebappConfig, run_once: bool = False, timeout: logger.exception(str(e)) if run_once: break - time.sleep(10) + time.sleep(refresh_period_seconds) logger.info("Report-hunting thread successfully killed.") diff --git a/notebooker/web/routes/prometheus.py b/notebooker/web/routes/prometheus.py index dc40a9ba..3799eddb 100644 --- a/notebooker/web/routes/prometheus.py +++ b/notebooker/web/routes/prometheus.py @@ -18,6 +18,18 @@ registry=REGISTRY, labelnames=["env", "method", "path", "http_status", "hostname"], ) +N_SUCCESSFUL_REPORTS = Counter( + "notebooker_n_successful_reports", + "Number of successful runs in the current session for the report", + registry=REGISTRY, + labelnames=["report_name", "report_title"], +) +N_FAILED_REPORTS = Counter( + "notebooker_n_failed_reports", + "Number of failed runs in the current session for the report", + registry=REGISTRY, + labelnames=["report_name", "report_title"], +) prometheus_bp = Blueprint("prometheus", __name__) @@ -39,6 +51,14 @@ def record_request_data(response): return response +def record_successful_report(report_name, report_title): + N_SUCCESSFUL_REPORTS.labels(report_name, report_title).inc() + + +def record_failed_report(report_name, report_title): + N_FAILED_REPORTS.labels(report_name, report_title).inc() + + def setup_metrics(app): app.before_request(start_timer) # The order here matters since we want stop_timer diff --git a/tests/integration/test_report_hunter.py b/tests/integration/test_report_hunter.py index f83f4b4e..14ba4120 100644 --- a/tests/integration/test_report_hunter.py +++ b/tests/integration/test_report_hunter.py @@ -2,6 +2,7 @@ import uuid import freezegun +import mock.mock import pytest from notebooker.constants import JobStatus, NotebookResultComplete, NotebookResultError, NotebookResultPending @@ -173,3 +174,56 @@ def test_report_hunter_pending_to_done(bson_library, webapp_config): serializer.save_check_result(expected) _report_hunter(webapp_config=webapp_config, run_once=True) assert get_report_cache(report_name, job_id, cache_dir=webapp_config.CACHE_DIR) == expected + + +@mock.patch("notebooker.web.routes.prometheus.record_failed_report") +def test_prometheus_logging_in_report_hunter_no_prometheus_fail(record_failed_report, bson_library, webapp_config): + job_id = str(uuid.uuid4()) + report_name = str(uuid.uuid4()) + serializer = initialize_serializer_from_config(webapp_config) + record_failed_report.side_effect = ImportError("wah") + + with freezegun.freeze_time(datetime.datetime(2018, 1, 12, 2, 37)): + expected = NotebookResultError( + job_id=job_id, + report_name=report_name, + report_title=report_name, + status=JobStatus.ERROR, + update_time=datetime.datetime(2018, 1, 12, 2, 37), + job_start_time=datetime.datetime(2018, 1, 12, 2, 30), + error_info="This was cancelled!", + ) + serializer.save_check_result(expected) + _report_hunter(webapp_config=webapp_config, run_once=True) + assert get_report_cache(report_name, job_id, cache_dir=webapp_config.CACHE_DIR) == expected + record_failed_report.assert_called_once_with(report_name, report_name) + + +@mock.patch("notebooker.web.routes.prometheus.record_successful_report") +def test_prometheus_logging_in_report_hunter_no_prometheus_success( + record_successful_report, bson_library, webapp_config +): + job_id = str(uuid.uuid4()) + report_name = str(uuid.uuid4()) + serializer = initialize_serializer_from_config(webapp_config) + record_successful_report.side_effect = ImportError("wah") + + with freezegun.freeze_time(datetime.datetime(2018, 1, 12, 2, 37)): + expected = NotebookResultComplete( + job_id=job_id, + report_name=report_name, + report_title=report_name, + status=JobStatus.DONE, + update_time=datetime.datetime(2018, 1, 12, 2, 37), + job_start_time=datetime.datetime(2018, 1, 12, 2, 30), + job_finish_time=datetime.datetime(2018, 1, 12, 2, 37), + pdf=b"abc", + raw_html="rawstuff", + email_html="emailstuff", + raw_html_resources={"outputs": {}, "inlining": []}, + raw_ipynb_json="[]", + ) + serializer.save_check_result(expected) + _report_hunter(webapp_config=webapp_config, run_once=True) + assert get_report_cache(report_name, job_id, cache_dir=webapp_config.CACHE_DIR) == expected + record_successful_report.assert_called_once_with(report_name, report_name)