Skip to content

Commit

Permalink
fix(api): Fix metrics lifecycle policies tasks (#29)
Browse files Browse the repository at this point in the history
* chore(api): Add endpoint for triggering MLP / Run MLP every 2 hours

* chore(api): Bump app version #patch

---------

Signed-off-by: hayk96 <hayko5999@gmail.com>
  • Loading branch information
hayk96 authored Jun 16, 2024
1 parent 4374aa2 commit 8e30efb
Show file tree
Hide file tree
Showing 12 changed files with 138 additions and 47 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.venv/
.idea/
tests/
docs/
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# User defined files
docs/examples/docker/rules
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 0.3.3 / 2024-06-16

* [ENHANCEMENT] Added a new endpoint: `/metrics-lifecycle-policies/trigger` for force-triggering all Metrics Lifecycle Policies. #29
* [CHANGE] Changed the execution interval of the task "Clean-up Prometheus series" by the scheduler to 2 hours (previously 20 minutes).
* [BUGFIX] Prevented the execution of more than one task at the same time, as tasks can remain in the running state for longer than their execution interval.

## 0.3.2 / 2024-06-08

* [ENHANCEMENT] Added a new endpoint: `/health` for retrieving system health. #28
Expand Down
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
FROM python:3.10-alpine
LABEL maintainer="Hayk Davtyan <hayko5999@gmail.com>"
ENV TZ UTC
WORKDIR app
COPY . .
RUN python -m pip install --no-cache-dir -r requirements.txt
Expand Down
65 changes: 61 additions & 4 deletions src/api/v1/endpoints/policies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from src.models.policy import MetricsLifecyclePolicyCreate, MetricsLifecyclePolicyUpdate
from fastapi import APIRouter, Response, Request, Body, status
from apscheduler.triggers.date import DateTrigger
from src.utils.scheduler import schedule
from src.core import policies as mlp
from src.utils.log import logger
from datetime import datetime
from typing import Annotated

router = APIRouter()
Expand All @@ -20,7 +23,7 @@
"application/json": {
"example": [
{
"match": "{__name__=~'go_.+'}",
"match": "{__name__=~'go_.*'}",
"keep_for": "7d",
"description": "This metrics lifecycle policy keeps GoLang metrics for 7 days"
}
Expand Down Expand Up @@ -78,7 +81,7 @@ async def get_policy(
"example": [
{
"GoLang Policy": {
"match": "{__name__=~'go_.+'}",
"match": "{__name__=~'go_.*'}",
"keep_for": "7d",
"message": "This policy keeps GoLang metrics for 7 days"
},
Expand Down Expand Up @@ -109,8 +112,8 @@ async def get_policies(


@router.post("/metrics-lifecycle-policies",
name="Create metric lifecycle policy",
description="Creates a new metric lifecycle policy",
name="Create metrics lifecycle policy",
description="Creates a new metrics lifecycle policy",
status_code=status.HTTP_201_CREATED,
tags=["metrics-lifecycle-policies"],
responses={
Expand Down Expand Up @@ -354,3 +357,57 @@ async def delete(
return {
"status": sts,
"message": msg} if response.status_code != 204 else response.status_code


@router.post("/metrics-lifecycle-policies/trigger",
             name="Trigger metrics lifecycle policies",
             description="Force-triggers all metrics lifecycle policies",
             status_code=status.HTTP_202_ACCEPTED,
             tags=["metrics-lifecycle-policies"],
             responses={
                 202: {
                     "description": "Accepted",
                     "content": {
                         "application/json": {
                             "example": [
                                 {
                                     "status": "success",
                                     # Example kept in sync with the actual
                                     # message returned below.
                                     "message": "Request has been accepted for processing"
                                 }
                             ]
                         }
                     }
                 },
                 409: {
                     "description": "Conflict",
                     "content": {
                         "application/json": {
                             "example": [
                                 {
                                     "status": "error",
                                     "message": "Cannot create a new task. Server is currently processing another task"
                                 }
                             ]
                         }
                     }
                 },
             }
             )
async def trigger(request: Request, response: Response):
    """
    Force-triggers a one-off run of all metrics lifecycle policies.

    If no clean-up task is currently running, schedules a one-shot job
    that fires immediately (DateTrigger with run_date=now) and returns
    202. If a task is already in progress, returns 409 and schedules
    nothing. Either way the outcome is logged and returned as a
    {"status": ..., "message": ...} payload.
    """
    # Imported at call time (not module level) so we read the current
    # value of the running_tasks flag and avoid an import cycle between
    # the API and task modules.
    from src.tasks.policies import running_tasks
    if not running_tasks:
        # One-shot job that starts right away.
        schedule(trigger=DateTrigger(run_date=datetime.now()))
        response.status_code, sts, msg = 202, "success", "Request has been accepted for processing"
    else:
        # NOTE(review): the flag check and the scheduling are not atomic;
        # two concurrent requests could both pass the check. Looks
        # acceptable for a single-worker deployment — confirm.
        response.status_code, sts, msg = 409, "error", \
            "Cannot create a new task. Server is currently processing another task"
    logger.info(
        msg=msg,
        extra={
            "status": response.status_code,
            "method": request.method,
            "request_path": request.url.path})
    return {
        "status": sts,
        "message": msg
    }
2 changes: 1 addition & 1 deletion src/models/policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class MetricsLifecyclePolicyCreate(BaseModel, extra=Extra.allow):
"description": "Time-series matching with regex will be kept for 7 days",
"value": {
"name": "Example Policy",
"match": "{__name__=~'go_.+'}",
"match": "{__name__=~'go_.*'}",
"keep_for": "7d",
"description": "Time-series matching with regex will be kept for 7 days."
}
Expand Down
89 changes: 56 additions & 33 deletions src/tasks/policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,85 @@
from src.utils.arguments import arg_parser
from src.utils.log import logger
from pytimeparse2 import parse
from time import time
import requests
import time


prom_addr = arg_parser().get("prom.addr")
running_tasks = False


def delete_series(policy_name: str, policy: dict) -> None:
def delete_series(policy_name: str, policy: dict) -> bool:
    """
    Deletes time-series matching a metrics lifecycle policy.

    Calls the following Prometheus endpoint:
    POST /api/v1/admin/tsdb/delete_series
    Series matching policy["match"] older than policy["keep_for"]
    (a pytimeparse2-parsable duration, e.g. "7d") are deleted.

    Args:
        policy_name: Name of the policy, used for log context only.
        policy: Policy settings; must contain "match" and "keep_for".

    Returns:
        True when Prometheus answered 204 (success), False otherwise.
    """
    # Everything older than now - keep_for is eligible for deletion.
    time_range = time() - parse(policy["keep_for"])
    try:
        r = requests.post(
            f'{prom_addr}/api/v1/admin/tsdb/delete_series?match[]={policy["match"]}&end={time_range}')
    # Narrowed from BaseException: catching BaseException also swallows
    # KeyboardInterrupt/SystemExit, which should propagate.
    except Exception as e:
        logger.error(e, extra={"policy_name": policy_name})
    else:
        if r.status_code == 204:
            logger.debug("Task clean-up time-series has been successfully completed",
                         extra={"policy_name": policy_name})
            return True
        logger.error(f"Failed to delete series, {r.json().get('error')}", extra={
            "status": r.status_code, "policy_name": policy_name})
    return False


def clean_tombstones() -> bool:
    """
    Removes deleted data from disk and cleans up existing tombstones.

    Calls the following Prometheus endpoint:
    POST /api/v1/admin/tsdb/clean_tombstones

    Returns:
        True when Prometheus answered 204 (success), False otherwise.
    """
    try:
        r = requests.post(
            f'{prom_addr}/api/v1/admin/tsdb/clean_tombstones')
    # Narrowed from BaseException: catching BaseException also swallows
    # KeyboardInterrupt/SystemExit, which should propagate.
    except Exception as e:
        logger.error(e)
    else:
        if r.status_code == 204:
            return True
        logger.error(f"Failed to clean tombstones, {r.json().get('error')}", extra={
            "status": r.status_code})
    return False


def run_policies() -> bool:
    """
    Runs the clean-up job for every user-defined metrics lifecycle policy.

    Loops over the loaded policies, deletes the matching series one by
    one, then cleans tombstones once at the end. At most one run may be
    in flight at a time, guarded by the module-level running_tasks flag.

    Returns:
        False when another run is already in progress, True otherwise.
    """
    global running_tasks
    if running_tasks:
        logger.warning(
            "Cannot create a new task. Server is currently processing another task")
        return False

    policies = load_policies()
    if policies:
        logger.debug(
            f"Found {len(policies)} metrics lifecycle {'policies' if len(policies) > 1 else 'policy'}. "
            f"Starting job to clean-up time-series.")
        running_tasks = True
        start_time = time()
        try:
            for p in policies:
                logger.debug(
                    "Task clean-up series is in progress", extra={
                        "policy_name": p, "match": policies[p]["match"],
                        "keep_for": policies[p]["keep_for"]})
                delete_series(policy_name=p, policy=policies[p])
            # One tombstone clean-up for the whole batch is enough.
            clean_tombstones()
        finally:
            # Always release the flag: without this, an unexpected error
            # mid-run would leave running_tasks True forever, blocking
            # every future scheduled run and the /trigger endpoint.
            running_tasks = False
        exec_time = float("{:.2f}".format(time() - start_time))
        logger.debug(
            "Task clean-up series has been completed", extra={
                "duration": exec_time})
    return True
2 changes: 1 addition & 1 deletion src/utils/openapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def openapi(app: FastAPI):
"providing additional features and addressing its limitations. "
"Running as a sidecar alongside the Prometheus server enables "
"users to extend the capabilities of the API.",
version="0.3.2",
version="0.3.3",
contact={
"name": "Hayk Davtyan",
"url": "https://hayk96.github.io",
Expand Down
12 changes: 6 additions & 6 deletions src/utils/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from src.tasks.policies import task_run_policies
from src.tasks.policies import run_policies
import atexit


def schedule(trigger=None):
    """
    Registers the metrics lifecycle clean-up job on a background scheduler.

    Args:
        trigger: Any APScheduler trigger. Defaults to an IntervalTrigger
            that runs the clean-up every 2 hours. Callers may pass e.g.
            a DateTrigger for a one-off immediate run (as the
            /metrics-lifecycle-policies/trigger endpoint does).
    """
    if trigger is None:
        # Built per call instead of as a def-time default argument, so a
        # single trigger instance is not evaluated once at import time
        # and shared across every invocation.
        trigger = IntervalTrigger(hours=2)
    scheduler = BackgroundScheduler()
    scheduler.start()
    scheduler.add_job(
        func=run_policies,
        trigger=trigger,
        replace_existing=True,
        name="Clean-up Prometheus time-series"
    )
    # Shut the scheduler down cleanly when the interpreter exits.
    atexit.register(lambda: scheduler.shutdown())
1 change: 1 addition & 0 deletions ui/homepage/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Extended HTTP API service for Prometheus</title>
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
<style>
body, h1, ul, li, a {
margin: 0;
Expand Down
1 change: 1 addition & 0 deletions ui/metrics-management/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Metrics Management</title>
<link rel="stylesheet" href="/metrics-management/style.css">
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
</head>
<body>
<div id="sidebar" class="sidebar">
Expand Down
3 changes: 2 additions & 1 deletion ui/rules-management/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Edit Rule</title>
<title>Rules Management</title>
<link rel="stylesheet" href="/rules-management/style.css">
<link rel="icon" type="image/png" href="https://raw.githubusercontent.com/hayk96/prometheus-api/main/docs/images/logo.png">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.2/codemirror.min.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.2/theme/monokai.min.css">
</head>
Expand Down

0 comments on commit 8e30efb

Please sign in to comment.