Add prometheus metrics to track pushkin things (#88)

* time waiting for locks * number of requests in flight
matrix-org · Mar 24, 2020 · 699aff8 · 699aff8
1 parent 9b1a1dd
commit 699aff8
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 16 deletions.
diff --git a/changelog.d/88.feature b/changelog.d/88.feature
@@ -0,0 +1 @@
+Add prometheus metrics to track pushkin things.
diff --git a/sygnal/apnspushkin.py b/sygnal/apnspushkin.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright 2014 OpenMarket Ltd
 # Copyright 2017 Vector Creations Ltd
-# Copyright 2019 The Matrix.org Foundation C.I.C.
+# Copyright 2019-2020 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 import aioapns
 from aioapns import APNs, NotificationRequest
 from opentracing import logs, tags
-from prometheus_client import Histogram, Counter
+from prometheus_client import Histogram, Counter, Gauge
 from twisted.internet.defer import Deferred
 
 from sygnal import apnstruncate
@@ -41,6 +41,10 @@
     "sygnal_apns_request_time", "Time taken to send HTTP request to APNS"
 )
 
+ACTIVE_REQUESTS_GAUGE = Gauge(
+    "sygnal_active_apns_requests", "Number of APNS requests in flight"
+)
+
 RESPONSE_STATUS_CODES_COUNTER = Counter(
     "sygnal_apns_status_codes",
     "Number of HTTP response status codes received from APNS",
@@ -148,8 +152,10 @@ async def _dispatch_request(self, log, span, device, shaved_payload, prio):
         )
 
         try:
-            with SEND_TIME_HISTOGRAM.time():
-                response = await self._send_notification(request)
+
+            with ACTIVE_REQUESTS_GAUGE.track_inprogress():
+                with SEND_TIME_HISTOGRAM.time():
+                    response = await self._send_notification(request)
         except aioapns.ConnectionError:
             raise TemporaryNotificationDispatchException("aioapns Connection Failure")
 

diff --git a/sygnal/gcmpushkin.py b/sygnal/gcmpushkin.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright 2014 Leon Handreke
 # Copyright 2017 New Vector Ltd
-# Copyright 2019 The Matrix.org Foundation C.I.C.
+# Copyright 2019-2020 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,24 +21,37 @@
 from json import JSONDecodeError
 
 from opentracing import logs, tags
-from prometheus_client import Histogram, Counter
+from prometheus_client import Counter, Gauge, Histogram
 from twisted.internet.defer import DeferredSemaphore
-from twisted.web.client import HTTPConnectionPool, Agent, FileBodyProducer, readBody
+from twisted.web.client import Agent, FileBodyProducer, HTTPConnectionPool, readBody
 from twisted.web.http_headers import Headers
 
 from sygnal.exceptions import (
-    TemporaryNotificationDispatchException,
     NotificationDispatchException,
+    TemporaryNotificationDispatchException,
 )
-from sygnal.utils import twisted_sleep, NotificationLoggerAdapter
 from sygnal.helper.context_factory import ClientTLSOptionsFactory
+from sygnal.utils import NotificationLoggerAdapter, twisted_sleep
+
 from .exceptions import PushkinSetupException
 from .notifications import Pushkin
 
+QUEUE_TIME_HISTOGRAM = Histogram(
+    "sygnal_gcm_queue_time", "Time taken waiting for a connection to GCM"
+)
+
 SEND_TIME_HISTOGRAM = Histogram(
     "sygnal_gcm_request_time", "Time taken to send HTTP request to GCM"
 )
 
+PENDING_REQUESTS_GAUGE = Gauge(
+    "sygnal_pending_gcm_requests", "Number of GCM requests waiting for a connection"
+)
+
+ACTIVE_REQUESTS_GAUGE = Gauge(
+    "sygnal_active_gcm_requests", "Number of GCM requests in flight"
+)
+
 RESPONSE_STATUS_CODES_COUNTER = Counter(
     "sygnal_gcm_status_codes",
     "Number of HTTP response status codes received from GCM",
@@ -145,12 +158,20 @@ async def _perform_http_request(self, body, headers):
         # we use the semaphore to actually limit the number of concurrent
         # requests, since the HTTPConnectionPool will actually just lead to more
         # requests being created but not pooled – it does not perform limiting.
-        await self.connection_semaphore.acquire()
+        with QUEUE_TIME_HISTOGRAM.time():
+            with PENDING_REQUESTS_GAUGE.track_inprogress():
+                await self.connection_semaphore.acquire()
+
         try:
-            response = await self.http_agent.request(
-                b"POST", GCM_URL, headers=Headers(headers), bodyProducer=body_producer
-            )
-            response_text = (await readBody(response)).decode()
+            with SEND_TIME_HISTOGRAM.time():
+                with ACTIVE_REQUESTS_GAUGE.track_inprogress():
+                    response = await self.http_agent.request(
+                        b"POST",
+                        GCM_URL,
+                        headers=Headers(headers),
+                        bodyProducer=body_producer,
+                    )
+                    response_text = (await readBody(response)).decode()
         except Exception as exception:
             raise TemporaryNotificationDispatchException(
                 "GCM request failure"
@@ -164,8 +185,7 @@ async def _request_dispatch(self, n, log, body, headers, pushkeys, span):
 
         failed = []
 
-        with SEND_TIME_HISTOGRAM.time():
-            response, response_text = await self._perform_http_request(body, headers)
+        response, response_text = await self._perform_http_request(body, headers)
 
         RESPONSE_STATUS_CODES_COUNTER.labels(
             pushkin=self.name, code=response.code