Skip to content

Commit

Permalink
feat(api): Add /health endpoint (#28)
Browse files Browse the repository at this point in the history
* fix(validations): move Prometheus /runtimeinfo API call under validations

Signed-off-by: hayk96 <hayko5999@gmail.com>

* feat(api): Add /health endpoint

Signed-off-by: hayk96 <hayko5999@gmail.com>

* docs: Update CHANGELOG.md

Signed-off-by: hayk96 <hayko5999@gmail.com>

* chore(api): Bump app version #patch

Signed-off-by: hayk96 <hayko5999@gmail.com>

---------

Signed-off-by: hayk96 <hayko5999@gmail.com>
  • Loading branch information
hayk96 authored Jun 8, 2024
1 parent 4daa8bb commit 4374aa2
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 16 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## 0.3.2 / 2024-06-08

* [ENHANCEMENT] Added a new endpoint: `/health` for retrieving system health. #28
* [ENHANCEMENT] Added a new function that repeatedly checks (up to 600 attempts at 3-second intervals) whether a connection to Prometheus can be established.
* [BUGFIX] The Prometheus /runtimeinfo API call check has been moved under the validation function.
* [BUGFIX] Added proper exception handling while checking the status of the reload API of Prometheus at runtime.

## 0.3.1 / 2024-06-01

* [ENHANCEMENT] Added a new webpage, Metrics Management, based on the `/metrics-lifecycle-policies` API. This feature allows
Expand Down
8 changes: 4 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
prom_addr, rule_path = args.get("prom.addr"), args.get("rule.path")
host, port = args.get("web.listen_address").split(":")

if not all([settings.check_prom_http_connection(prom_addr),
settings.check_reload_api_status(prom_addr),
settings.check_rules_directory(rule_path),
settings.check_fs_permissions(rule_path)]):
if not all([settings.check_rules_directory(rule_path),
settings.check_fs_permissions(rule_path),
settings.establish_prom_connection(prom_addr),
settings.check_reload_api_status(prom_addr)]):
sys.exit()


Expand Down
3 changes: 2 additions & 1 deletion src/api/v1/api.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from .. v1.endpoints import reverse_proxy, rules, policies, web
from .. v1.endpoints import reverse_proxy, rules, policies, web, health
from fastapi import APIRouter

# Top-level router that aggregates every endpoint router of the application.
api_router = APIRouter()
# Rules and policies endpoints are mounted under the /api/v1 prefix.
api_router.include_router(rules.router, prefix="/api/v1")
api_router.include_router(policies.router, prefix="/api/v1")
# Web and health routers are mounted without a prefix.
api_router.include_router(web.router, prefix="")
api_router.include_router(health.router, prefix="")
# Catch-all route for GET/POST/PUT on any path, handled by the reverse proxy.
# NOTE(review): it is registered after the routers above, presumably so the
# specific routes are matched first -- confirm before reordering.
api_router.add_route("/{path:path}", reverse_proxy._reverse_proxy, ["GET", "POST", "PUT"])
50 changes: 50 additions & 0 deletions src/api/v1/endpoints/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from src.utils.settings import check_prom_readiness
from fastapi import APIRouter, Response, status
from src.utils.arguments import arg_parser

# Router that the /health endpoint below is registered on.
router = APIRouter()
# NOTE(review): rule_path is not referenced in the visible part of this
# module -- confirm whether it is still needed.
rule_path = arg_parser().get("rule.path")
# Prometheus address taken from the parsed arguments at import time;
# read by the /health handler.
prom_addr = arg_parser().get("prom.addr")


@router.get("/health",
            name="Get system health",
            description="Returns a 200 status when the prometheus-api is able to connect to the Prometheus server",
            status_code=status.HTTP_200_OK,
            tags=["health"],
            responses={
                200: {
                    "description": "OK",
                    "content": {
                        "application/json": {
                            # The handler returns a single JSON object, so
                            # the example is an object rather than a list.
                            "example": {
                                "status": "success",
                                "message": "Service is up and running"
                            }
                        }
                    }
                },
                503: {
                    "description": "Service Unavailable",
                    "content": {
                        "application/json": {
                            "example": {
                                "status": "error",
                                "message": "Service is unavailable due to a health-check failure"
                            }
                        }
                    }
                }
            })
async def health(response: Response):
    """
    Reports whether the Prometheus server is reachable.

    Runs check_prom_readiness() against the configured Prometheus
    address. On success the default 200 status is kept; on failure
    the response status is switched to 503.

    :param response: response object whose status code is set on failure
    :return: dict with the "status" and "message" keys
    """
    # Imported locally so the block does not rely on the module's import list.
    from fastapi.concurrency import run_in_threadpool

    # check_prom_readiness performs a blocking HTTP request; running it in
    # the threadpool keeps the event loop free for other requests while
    # Prometheus is slow or unreachable.
    if await run_in_threadpool(check_prom_readiness, prom_addr):
        return {"status": "success",
                "message": "Service is up and running"}

    response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    return {"status": "error",
            "message": "Service is unavailable due to a health-check failure"}
4 changes: 2 additions & 2 deletions src/core/policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
rule_path = arg_parser().get("rule.path")
prom_addr = arg_parser().get("prom.addr")
policies_data_file = ".policies.json"
prom_storage_retention_human = prom_info(
prom_addr, "/runtimeinfo")["data"]["storageRetention"]


def sync_to_file(data) -> None:
Expand Down Expand Up @@ -105,6 +103,8 @@ def validate_duration(val) -> tuple[bool, int, str, str, int]:
This function compares the value of the 'keep_for'
field with the retention time of the Prometheus server
"""
prom_storage_retention_human = prom_info(
prom_addr, "/runtimeinfo")["data"]["storageRetention"]
prom_storage_retention_seconds = parse(prom_storage_retention_human)
val_seconds = parse(val)
if val_seconds >= prom_storage_retention_seconds:
Expand Down
2 changes: 1 addition & 1 deletion src/utils/openapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def openapi(app: FastAPI):
"providing additional features and addressing its limitations. "
"Running as a sidecar alongside the Prometheus server enables "
"users to extend the capabilities of the API.",
version="0.3.1",
version="0.3.2",
contact={
"name": "Hayk Davtyan",
"url": "https://hayk96.github.io",
Expand Down
37 changes: 29 additions & 8 deletions src/utils/settings.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from os import remove, path
from .log import logger
from time import sleep
import requests


def check_prom_http_connection(prometheus_address) -> bool:
def check_prom_readiness(prometheus_address) -> bool:
"""Checks the connection to the Prometheus server over HTTP."""
try:
r = requests.get(f"{prometheus_address}/-/ready")
Expand All @@ -20,15 +21,35 @@ def check_prom_http_connection(prometheus_address) -> bool:
return False


def establish_prom_connection(prometheus_address, retries=600, interval=3) -> bool:
    """
    Waits for the Prometheus server to become ready.

    Calls check_prom_readiness() up to `retries` times, pausing
    `interval` seconds between consecutive attempts. With the
    defaults (600 checks, 3 seconds apart) the total wait is about
    30 minutes, plus the time spent in the checks themselves.

    :param prometheus_address: base URL of the Prometheus server
    :param retries: maximum number of readiness checks
    :param interval: seconds to sleep between two checks
    :return: True as soon as one check succeeds, False once every
             attempt has failed
    """
    for attempt in range(1, retries + 1):
        if check_prom_readiness(prometheus_address):
            return True
        # Sleeping after the final attempt would only delay the failure.
        if attempt < retries:
            sleep(interval)
    logger.error(
        "Connection to Prometheus failed: Maximum retry attempts exceeded. The server has been shut down.")
    return False


def check_reload_api_status(prometheus_address) -> bool:
    """
    Checks the status of the Prometheus Management API.

    Sends a POST request to the /-/reload endpoint of the given
    Prometheus server and inspects the outcome.

    :param prometheus_address: base URL of the Prometheus server
    :return: False when the server cannot be reached or answers with
             403 (reload API not enabled), True otherwise
    """
    try:
        r = requests.post(f"{prometheus_address}/-/reload")
    except requests.exceptions.ConnectionError as e:
        logger.error(e)
        # Report the failure explicitly so callers that combine the
        # checks with all([...]) always receive a real boolean.
        return False

    if r.status_code == 403:
        logger.error(
            f"{r.text} It's disabled by default and can be enabled via the --web.enable-lifecycle. "
            f"See https://prometheus.io/docs/prometheus/latest/management_api/#reload for more details.")
        return False
    return True


def check_rules_directory(prometheus_rules_dir) -> bool:
Expand Down

0 comments on commit 4374aa2

Please sign in to comment.