From 459c18ddf204bfe79b658776a959b562dc46a0fc Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 24 Aug 2023 09:03:55 -0700 Subject: [PATCH 1/3] In a previous commit, the detection of a failure became too aggressive. This commit remedies that by considering a run 'failed' if the heartbeat hasn't been updated within the heartbeat_cutoff time, rather than the heartbeat_threshold time --- services/ui_backend_service/data/db/tables/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/ui_backend_service/data/db/tables/run.py b/services/ui_backend_service/data/db/tables/run.py index db698918..ff2e6c41 100644 --- a/services/ui_backend_service/data/db/tables/run.py +++ b/services/ui_backend_service/data/db/tables/run.py @@ -130,7 +130,7 @@ def select_columns(self): WHEN end_attempt_ok IS NOT NULL AND end_attempt_ok.value IS FALSE THEN 'failed' WHEN {table_name}.last_heartbeat_ts IS NOT NULL - AND @(extract(epoch from now())-{table_name}.last_heartbeat_ts)<={heartbeat_threshold} + AND @(extract(epoch from now())-{table_name}.last_heartbeat_ts)<={heartbeat_cutoff} THEN 'running' ELSE 'failed' END) AS status From a1781578e8c4ae381538c84676d2a0e1fdc8e5b7 Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Tue, 5 Sep 2023 17:37:40 +0300 Subject: [PATCH 2/3] change run finished at query to heartbeat_cutoff from threshold --- services/ui_backend_service/data/db/tables/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/ui_backend_service/data/db/tables/run.py b/services/ui_backend_service/data/db/tables/run.py index ff2e6c41..f804b602 100644 --- a/services/ui_backend_service/data/db/tables/run.py +++ b/services/ui_backend_service/data/db/tables/run.py @@ -111,7 +111,7 @@ def select_columns(self): WHEN end_attempt_ok IS NOT NULL THEN end_attempt_ok.ts_epoch WHEN {table_name}.last_heartbeat_ts IS NOT NULL - AND @(extract(epoch from
now())-{table_name}.last_heartbeat_ts)<={heartbeat_cutoff} THEN NULL ELSE {table_name}.last_heartbeat_ts*1000 END) AS finished_at From a8c67774ce636f8a4a2ed7f12f1ce4eb80e96d30 Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Tue, 5 Sep 2023 17:40:42 +0300 Subject: [PATCH 3/3] clean up unused values from run query --- services/ui_backend_service/data/db/tables/run.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/services/ui_backend_service/data/db/tables/run.py b/services/ui_backend_service/data/db/tables/run.py index f804b602..1cb97cfa 100644 --- a/services/ui_backend_service/data/db/tables/run.py +++ b/services/ui_backend_service/data/db/tables/run.py @@ -117,7 +117,6 @@ def select_columns(self): END) AS finished_at """.format( table_name=table_name, - heartbeat_threshold=HEARTBEAT_THRESHOLD, heartbeat_cutoff=RUN_INACTIVE_CUTOFF_TIME, ), """ @@ -136,8 +135,6 @@ def select_columns(self): END) AS status """.format( table_name=table_name, - heartbeat_threshold=HEARTBEAT_THRESHOLD, - cutoff=OLD_RUN_FAILURE_CUTOFF_TIME, heartbeat_cutoff=RUN_INACTIVE_CUTOFF_TIME, ), """ @@ -157,7 +154,6 @@ def select_columns(self): END) AS duration """.format( table_name=table_name, - heartbeat_threshold=HEARTBEAT_THRESHOLD, cutoff=OLD_RUN_FAILURE_CUTOFF_TIME, ), ]