-
Notifications
You must be signed in to change notification settings - Fork 279
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add Prometheus instrumentation Closes #3214 * Fix missing bind argument * Run Prometheus exporter as a separate service * Expose number of streaming requests and number of streamed entities as metrics * Expose number of auth attempts as Prometheus metrics * Update Helm chart to expose metrics endpoints, set up ServiceMonitors * Handle requests without Authz object gracefully * Rename Prometheus label to "api_endpoint" to prevent naming clashes Prometheus Operator also uses the "endpoint" label and automatically renames "endpoint" labels exposed by the metrics endpoint to "exported_endpoints" which is ugly. * Add xref metrics * Use common prefix for all metric names Even though it is considered an anti-pattern to add a prefix with the name of the software or component to metrics (according to the official Prometheus documentation), I have decided to add a prefix. I’ve found that this makes it much easier to find relevant metrics. The main disadvantage of per-component prefixes is that queries become slightly more complex if you want to query the same metric (e.g. HTTP request duration) across multiple components. This isn’t super important in our case though, so I think the trade-off is acceptable. * Expose Python platform information as Prometheus metrics * Remove unused port, network policy from K8s specs Although I'm not 100% sure, the exposed port 3000 probably is a left-over from the past, possibly when convert-document was still part of ingest-file. The network policy prevented Prometheus from scraping ingest-file metrics (and as the metrics port is now the only port exposed by ingest-file, should be otherwise unnecessary). * Use keyword args to set Prometheus metric labels As suggested by @stchris * Bump servicelayer from 1.22.0 to 1.22.1 * Simplify entity streaming metrics code There’s no need to do batched metric increments until this becomes a performance bottleneck. 
* Limit maximum size of Prometheus multiprocessing directory * Do not let collector classes inherit from `object` I copied the boilerplate for custom collectors from the docs without thinking about it too much, but inheriting from `object` really isn’t necessary anymore in Python 3. The Prometheus client also exports an abstract `Collector` class -- it doesn’t do anything except providing type hints for the `collect` method which is nice. * Add `aleph_` prefix to Prometheus API metrics * Fix metrics name (singular -> plural) * Add documentation on how to test Prometheus instrumentation in local Kubernetes cluster
- Loading branch information
1 parent
3827d97
commit e5eba0d
Showing
24 changed files
with
906 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
from sqlalchemy import func | ||
from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily | ||
from prometheus_client.registry import Collector | ||
from followthemoney import __version__ as ftm_version | ||
|
||
from aleph import __version__ as aleph_version | ||
from aleph.core import create_app as create_flask_app | ||
from aleph.queues import get_active_dataset_status | ||
from aleph.model import Role, Collection, EntitySet, Bookmark | ||
|
||
|
||
class InfoCollector(Collector):
    """Expose the versions of Aleph and followthemoney as an info metric."""

    def collect(self):
        """Yield a single ``aleph_system`` info metric with version labels."""
        versions = {
            "aleph_version": aleph_version,
            "ftm_version": ftm_version,
        }
        system_info = InfoMetricFamily(
            "aleph_system",
            "Aleph system information",
            value=versions,
        )
        yield system_info
|
||
|
||
class DatabaseCollector(Collector):
    """Collect usage gauges by counting rows in the Aleph SQL database.

    All queries run inside an application context of a Flask app that is
    created once, when the collector is instantiated.
    """

    def __init__(self):
        # The model queries need an application context; build the app once
        # and reuse it for every scrape.
        self._flask_app = create_flask_app()

    def collect(self):
        """Yield every database-backed gauge within one app context."""
        with self._flask_app.app_context():
            yield self._users()
            yield self._collections()
            yield self._collection_users()
            yield self._entitysets()
            yield self._entityset_users()
            yield self._bookmarks()
            yield self._bookmark_users()

    @staticmethod
    def _grouped_gauge(name, documentation, label, query):
        """Build a gauge with one sample per ``(group key, count)`` row.

        ``query`` must be iterable and produce two-item rows: the value of
        the grouping column followed by the count for that group.
        """
        gauge = GaugeMetricFamily(name, documentation, labels=[label])

        for key, count in query:
            # NOTE(review): a NULL grouping column would arrive here as
            # None, and label values are presumably required to be strings
            # when the metrics are rendered. Map it to the empty label value
            # so one such row cannot break the scrape. Confirm whether the
            # grouped columns are nullable.
            gauge.add_metric(["" if key is None else key], count)

        return gauge

    def _users(self):
        """Return a gauge counting the rows of ``Role.all_users()``."""
        return GaugeMetricFamily(
            "aleph_users",
            "Total number of users",
            value=Role.all_users().count(),
        )

    def _collections(self):
        """Return the number of collections, grouped by category."""
        query = (
            Collection.all()
            .with_entities(Collection.category, func.count())
            .group_by(Collection.category)
        )

        return self._grouped_gauge(
            "aleph_collections",
            "Total number of collections by category",
            "category",
            query,
        )

    def _collection_users(self):
        """Return the number of distinct collection creators per category."""
        query = (
            Collection.all()
            .with_entities(
                Collection.category,
                func.count(func.distinct(Collection.creator_id)),
            )
            .group_by(Collection.category)
        )

        return self._grouped_gauge(
            "aleph_collection_users",
            "Total number of users that have created at least one collection",
            "category",
            query,
        )

    def _entitysets(self):
        """Return the number of entity sets, grouped by type."""
        query = (
            EntitySet.all()
            .with_entities(EntitySet.type, func.count())
            .group_by(EntitySet.type)
        )

        return self._grouped_gauge(
            "aleph_entitysets",
            "Total number of entity sets by type",
            "type",
            query,
        )

    def _entityset_users(self):
        """Return the number of distinct entity set owners per type."""
        query = (
            EntitySet.all()
            .with_entities(
                EntitySet.type,
                func.count(func.distinct(EntitySet.role_id)),
            )
            .group_by(EntitySet.type)
        )

        return self._grouped_gauge(
            "aleph_entityset_users",
            "Number of users that have created at least one entity set of the given type",
            "type",
            query,
        )

    def _bookmarks(self):
        """Return a gauge with the total number of bookmark rows."""
        return GaugeMetricFamily(
            "aleph_bookmarks",
            "Total number of bookmarks",
            value=Bookmark.query.count(),
        )

    def _bookmark_users(self):
        """Return a gauge with the number of distinct bookmark owners."""
        # Count distinct role ids with COUNT(DISTINCT ...), the same way the
        # other *_users gauges do. Passing a column to Query.distinct() only
        # yields a per-column distinct on backends that support DISTINCT ON.
        # NOTE(review): COUNT(DISTINCT ...) skips NULL role ids; confirm the
        # column is not nullable if that distinction matters.
        distinct_users = Bookmark.query.with_entities(
            func.count(func.distinct(Bookmark.role_id))
        ).scalar()

        return GaugeMetricFamily(
            "aleph_bookmark_users",
            "Number of users that have created at least one bookmark",
            value=distinct_users,
        )
|
||
|
||
class QueuesCollector(Collector):
    """Report the number of active datasets and the task counts per stage."""

    def collect(self):
        """Yield the active-dataset gauge, then the per-stage task gauge."""
        status = get_active_dataset_status()

        yield GaugeMetricFamily(
            "aleph_active_datasets",
            "Total number of active datasets",
            value=status["total"],
        )

        # Sum the pending/running counters of every job of every dataset,
        # keyed by stage name.
        totals = {}

        for dataset in status["datasets"].values():
            for job in dataset["jobs"]:
                for entry in job["stages"]:
                    name = entry["stage"]
                    pending = entry["pending"]
                    running = entry["running"]

                    counters = totals.setdefault(
                        name,
                        {"pending": 0, "running": 0},
                    )
                    counters["pending"] += pending
                    counters["running"] += running

        gauge = GaugeMetricFamily(
            "aleph_tasks",
            "Total number of pending or running tasks in a given stage",
            labels=["stage", "status"],
        )

        for name, counters in totals.items():
            for state in ("pending", "running"):
                gauge.add_metric([name, state], counters[state])

        yield gauge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from prometheus_client import make_wsgi_app, PLATFORM_COLLECTOR | ||
from prometheus_client.core import CollectorRegistry | ||
|
||
from aleph.metrics.collectors import InfoCollector, DatabaseCollector, QueuesCollector | ||
|
||
|
||
def create_app():
    """Return a WSGI app that serves the metrics of a dedicated registry.

    The registry holds the platform collector plus one instance of each of
    the info, database and queue collectors.
    """
    registry = CollectorRegistry()
    registry.register(PLATFORM_COLLECTOR)

    # Instantiate and register one collector at a time, in a fixed order.
    for collector_cls in (InfoCollector, DatabaseCollector, QueuesCollector):
        registry.register(collector_cls())

    metrics_app = make_wsgi_app(registry=registry)
    return metrics_app
|
||
|
||
# Module-level WSGI application, built once when this module is imported.
app = create_app() |
Oops, something went wrong.