-
Notifications
You must be signed in to change notification settings - Fork 24
/
check_rule_counts
executable file
·130 lines (102 loc) · 5.6 KB
/
check_rule_counts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
# Copyright 2012-2024 CERN
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Authors:
# - Donata Mielaikaite, <donata.mielaikaite@cern.ch>, 2020
# - Eric Vaandering, <ewv@fnal.gov>, 2021
# - Fernando Garzon, ogarzonm@cern.ch, 2022
# - Hasan Ozturk, haozturk AT cern DOT ch, 2024
"""
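Probe that reports replication rule and lock counts (per state, per activity
and per age threshold) as gauges through the PrometheusPusher helper.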
"""
import datetime
import json
import sys
import traceback
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
from rucio.common.config import config_get
from rucio.db.sqla import models
from rucio.db.sqla.constants import (RuleState)
from rucio.db.sqla.session import get_session
from rucio.db.sqla.util import get_count
from sqlalchemy import func
from utils import common
# Short local aliases for the helpers provided by utils.common.
PrometheusPusher = common.PrometheusPusher
probe_metrics = common.probe_metrics

# Exit statuses returned by the probe process.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3
if __name__ == '__main__':
    # Probe entry point: run a series of aggregate queries over the
    # ReplicationRule table and publish each result as a labelled gauge.
    # Exits with OK on success and UNKNOWN if any step raises.

    # Bound before the ``try`` so that the ``finally`` clause can tell whether
    # a session was ever obtained; otherwise a failing get_session() would
    # trigger a NameError in ``finally`` and mask the intended exit status.
    session = None
    try:
        session = get_session()

        # Label used on the gauges -> rule state value used in the DB filter.
        states = {
            'REPLICATING': RuleState.REPLICATING,
            'OK': RuleState.OK,
            'INJECT': RuleState.INJECT,
            'STUCK': RuleState.STUCK,
            'SUSPENDED': RuleState.SUSPENDED,
            'WAITING_APPROVAL': RuleState.WAITING_APPROVAL,
        }

        # Number of days which will be used to aggregate rules/locks
        older_than_n_days = [1, 7, 21, 90, 180, 365]

        # NOTE(review): the manager presumably pushes the collected gauges
        # when the ``with`` block exits -- confirm in utils.common.
        with PrometheusPusher() as manager:

            # Rule count per state
            state_counts = (session.query(models.ReplicationRule.state,
                                          func.count(models.ReplicationRule.state))
                            .group_by(models.ReplicationRule.state)
                            .with_hint(models.ReplicationRule, 'INDEX_FFS(rules RULES_PK)', 'oracle')
                            .all())
            for state, num in state_counts:
                manager.gauge(name='rule_count_per_state.{state}',
                              documentation='Number of rules in a given state').labels(state=str(state.name)).set(num)

            # Rule count per state and activity
            state_activity_counts = (session.query(models.ReplicationRule.state,
                                                   models.ReplicationRule.activity,
                                                   func.count(models.ReplicationRule.state))
                                     .group_by(models.ReplicationRule.activity, models.ReplicationRule.state)
                                     .with_hint(models.ReplicationRule, 'INDEX_FFS(rules RULES_PK)', 'oracle')
                                     .all())
            for state, activity, num in state_activity_counts:
                manager.gauge(name='rule_count_per_state_and_activity.{state}.{activity}',
                              documentation='Number of rules in a given state and activity').labels(state=state.name, activity=activity).set(num)

            # Lock count per state (STUCK and REPLICATING) and activity
            lock_sums = (session.query(models.ReplicationRule.activity,
                                       func.sum(models.ReplicationRule.locks_stuck_cnt),
                                       func.sum(models.ReplicationRule.locks_replicating_cnt))
                         .group_by(models.ReplicationRule.activity)
                         .with_hint(models.ReplicationRule, 'INDEX_FFS(rules RULES_PK)', 'oracle')
                         .all())
            for activity, stuck_locks, replicating_locks in lock_sums:
                # SUM() yields NULL (None) when every summed value is NULL;
                # report 0 in that case, as the per-age queries below do.
                manager.gauge(name='lock_count_per_state_and_activity.{state}.{activity}',
                              documentation='Number of S/R locks by activity and state').labels(state='STUCK', activity=activity).set(stuck_locks or 0)
                manager.gauge(name='lock_count_per_state_and_activity.{state}.{activity}',
                              documentation='Number of S/R locks by activity and state').labels(state='REPLICATING', activity=activity).set(replicating_locks or 0)

            # Single reference instant so that every age threshold is measured
            # from the same moment. Kept as a naive UTC datetime, as before.
            # NOTE(review): presumably created_at is stored as naive UTC -- confirm.
            now = datetime.datetime.utcnow()

            for nDays in older_than_n_days:
                age = now - datetime.timedelta(days=nDays)

                for stateName, stateDB in states.items():
                    # Rule count per state and date
                    rule_count = (session.query(func.count(models.ReplicationRule.id))
                                  .filter(models.ReplicationRule.state == stateDB)
                                  .filter(models.ReplicationRule.created_at <= age)
                                  .scalar()) or 0
                    manager.gauge(name='rule_count_per_state_and_date.{state}.{older_than_days}',
                                  documentation='Rule count per state and date').labels(state=stateName, older_than_days=nDays).set(rule_count)

                    # File count per state and date (sum of locks_stuck_cnt
                    # over the matching rules; None when nothing matches).
                    file_count = (session.query(func.sum(models.ReplicationRule.locks_stuck_cnt))
                                  .filter(models.ReplicationRule.state == stateDB)
                                  .filter(models.ReplicationRule.created_at <= age)
                                  .scalar()) or 0
                    manager.gauge(name='file_count_per_state_and_date.{state}.{older_than_days}',
                                  documentation='File count per state and date').labels(state=stateName, older_than_days=nDays).set(file_count)
    except Exception:
        # Top-level boundary: report the failure and signal UNKNOWN.
        # KeyboardInterrupt/SystemExit are deliberately not intercepted.
        print(traceback.format_exc())
        sys.exit(UNKNOWN)
    finally:
        # Only release the session if get_session() actually succeeded.
        if session is not None:
            session.remove()
    sys.exit(OK)