From c3cb22bf3673b981390285e71ea9b410d7d26cb6 Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 5 Jan 2023 08:37:54 +0100 Subject: [PATCH] Very basic support for OpenMetrics (aka Prometheus) (#442) This PR: - creates an OpenMetrics server that enables collecting performance data from this process by e.g. a Prometheus server; - exposes as metrics the performance of http requests with MatrixBot. Further metrics may of course be added. --- config/harness.yaml | 13 ++++ package.json | 1 + src/Mjolnir.ts | 7 +- src/appservice/AppService.ts | 6 ++ src/appservice/cli.ts | 3 + src/appservice/config/config.ts | 36 +++++++++ src/config.ts | 69 +++++++++++++++++ src/index.ts | 5 +- src/utils.ts | 54 +++++++++++++- src/webapis/OpenMetrics.ts | 102 ++++++++++++++++++++++++++ src/webapis/WebAPIs.ts | 6 +- test/integration/fixtures.ts | 1 + test/integration/mjolnirSetupUtils.ts | 5 +- test/integration/openMetricsTest.ts | 43 +++++++++++ tsconfig.json | 1 + 15 files changed, 342 insertions(+), 10 deletions(-) create mode 100644 src/webapis/OpenMetrics.ts create mode 100644 test/integration/openMetricsTest.ts diff --git a/config/harness.yaml b/config/harness.yaml index 18b992fa..e6fd77ab 100644 --- a/config/harness.yaml +++ b/config/harness.yaml @@ -177,6 +177,19 @@ health: # Defaults to 418. unhealthyStatus: 418 + openMetrics: + # Whether openMetrics should be enabled (default false, activated for tests) + enabled: true + + # The port to expose the webserver on. Defaults to 8081. + port: 9090 + + # The address to listen for requests on. Defaults to all addresses. + address: "0.0.0.0" + + # The path to expose the monitoring endpoint at. Defaults to `/metrics` + endpoint: "/metrics" + # Options for exposing web APIs. web: # Whether to enable web APIs. diff --git a/package.json b/package.json index b517b8c1..6a13d102 100644 --- a/package.json +++ b/package.json @@ -58,6 +58,7 @@ "matrix-appservice-bridge": "8.0.0", "parse-duration": "^1.0.2", "pg": "^8.8.0", + "prom-client": "^14.1.0", "shell-quote": "^1.7.3", "ulidx": "^0.3.0", "yaml": "^2.1.1" diff --git a/src/Mjolnir.ts b/src/Mjolnir.ts index eae577bf..1a764e96 100644 --- a/src/Mjolnir.ts +++ b/src/Mjolnir.ts @@ -38,6 +38,7 @@ import { ProtectionManager } from "./protections/ProtectionManager"; import { RoomMemberManager } from "./RoomMembers"; import ProtectedRoomsConfig from "./ProtectedRoomsConfig"; import { MatrixEmitter, MatrixSendClient } from "./MatrixEmitter"; +import { OpenMetrics } from "./webapis/OpenMetrics"; export const STATE_NOT_STARTED = "not_started"; export const STATE_CHECKING_PERMISSIONS = "checking_permissions"; @@ -64,6 +65,7 @@ export class Mjolnir { private protectedRoomsConfig: ProtectedRoomsConfig; public readonly protectedRoomsTracker: ProtectedRoomsSet; private webapis: WebAPIs; + private openMetrics: OpenMetrics; public taskQueue: ThrottlingQueue; /** * Reporting back to the management room. @@ -233,6 +235,7 @@ export class Mjolnir { if (config.pollReports) { this.reportPoller = new ReportPoller(this, this.reportManager); } + this.openMetrics = new OpenMetrics(this.config); // Setup join/leave listener this.roomJoins = new RoomMemberManager(this.matrixEmitter); this.taskQueue = new ThrottlingQueue(this, config.backgroundDelayMS); @@ -261,6 +264,7 @@ export class Mjolnir { * Start Mjölnir. */ public async start() { + LogService.info("Mjolnir", "Starting Mjolnir instance"); try { // Start the web server. console.log("Starting web server"); @@ -279,7 +283,7 @@ export class Mjolnir { } this.reportPoller.start(reportPollSetting.from); } - + await this.openMetrics.start(); // Load the state. this.currentState = STATE_CHECKING_PERMISSIONS; @@ -330,6 +334,7 @@ export class Mjolnir { this.matrixEmitter.stop(); this.webapis.stop(); this.reportPoller?.stop(); + this.openMetrics.stop(); } /** diff --git a/src/appservice/AppService.ts b/src/appservice/AppService.ts index 7669ad51..57f4d40e 100644 --- a/src/appservice/AppService.ts +++ b/src/appservice/AppService.ts @@ -20,6 +20,7 @@ import { DataStore, PgDataStore } from ".//datastore"; import { Api } from "./Api"; import { IConfig } from "./config/config"; import { AccessControl } from "./AccessControl"; +import { OpenMetrics } from "../webapis/OpenMetrics"; const log = new Logger("AppService"); /** @@ -29,6 +30,7 @@ const log = new Logger("AppService"); export class MjolnirAppService { private readonly api: Api; + private readonly openMetrics: OpenMetrics; /** * The constructor is private because we want to ensure intialization steps are followed, @@ -42,6 +44,7 @@ export class MjolnirAppService { private readonly dataStore: DataStore, ) { this.api = new Api(config.homeserver.url, mjolnirManager); + this.openMetrics = new OpenMetrics(config); } /** @@ -144,6 +147,8 @@ export class MjolnirAppService { this.api.start(this.config.webAPI.port); await this.bridge.listen(port); log.info("MjolnirAppService started successfully"); + + await this.openMetrics.start(); } /** @@ -153,6 +158,7 @@ export class MjolnirAppService { await this.bridge.close(); await this.dataStore.close(); await this.api.close(); + this.openMetrics.stop(); } /** diff --git a/src/appservice/cli.ts b/src/appservice/cli.ts index 9afd6735..762b8080 100644 --- a/src/appservice/cli.ts +++ b/src/appservice/cli.ts @@ -1,6 +1,7 @@ import { Cli } from "matrix-appservice-bridge"; import { MjolnirAppService } from "./AppService"; import { IConfig } from "./config/config"; +import * as utils from "../utils"; /** * This file provides the entrypoint for the appservice mode for mjolnir. @@ -20,6 +21,8 @@ const cli = new Cli({ if (config === null) { throw new Error("Couldn't load config"); } + utils.initializeSentry(config); + utils.initializeGlobalPerformanceMetrics(config); await MjolnirAppService.run(port, config, cli.getRegistrationFilePath()); } }); diff --git a/src/appservice/config/config.ts b/src/appservice/config/config.ts index 969b4fc2..aff4829a 100644 --- a/src/appservice/config/config.ts +++ b/src/appservice/config/config.ts @@ -39,6 +39,42 @@ export interface IConfig { accessControlList: string, /** configuration for matrix-appservice-bridge's Logger */ logging?: LoggingOpts, + health?: { + // If specified, attempt to upload any crash statistics to sentry. + sentry?: { + dsn: string; + + // Frequency of performance monitoring. + // + // A number in [0.0, 1.0], where 0.0 means "don't bother with tracing" + // and 1.0 means "trace performance at every opportunity". + tracesSampleRate: number; + }; + openMetrics?: { + /** + * If `true`, expose a web server for server metrics, e.g. performance. + * + * Intended to be used with Prometheus or another Open Metrics scrapper. + */ + enabled: boolean; + /** + * The port on which to expose server metrics. + */ + port: number; + /** + * The path at which to collect health metrics. + * + * If unspecified, use `"/metrics"`. + */ + endpoint: string; + /** + * If specified, only serve this address mask. + * + * If unspecified, use 0.0.0.0 (accessible by any host). + */ + address: string; + } + } } export function read(configPath: string): IConfig { diff --git a/src/config.ts b/src/config.ts index c716aa8c..00e27eab 100644 --- a/src/config.ts +++ b/src/config.ts @@ -19,6 +19,45 @@ import { load } from "js-yaml"; import { MatrixClient, LogService } from "matrix-bot-sdk"; import Config from "config"; +export interface IHealthConfig { + health?: { + // If specified, attempt to upload any crash statistics to sentry. + sentry?: { + dsn: string; + + // Frequency of performance monitoring. + // + // A number in [0.0, 1.0], where 0.0 means "don't bother with tracing" + // and 1.0 means "trace performance at every opportunity". + tracesSampleRate: number; + }; + openMetrics?: { + /** + * If `true`, expose a web server for server metrics, e.g. performance. + * + * Intended to be used with Prometheus or another Open Metrics scrapper. + */ + enabled: boolean; + /** + * The port on which to expose server metrics. + */ + port: number; + /** + * The path at which to collect health metrics. + * + * If unspecified, use `"/metrics"`. + */ + endpoint: string; + /** + * If specified, only serve this address mask. + * + * If unspecified, use 0.0.0.0 (accessible by any host). + */ + address: string; + } + } +} + /** * The configuration, as read from production.yaml * @@ -99,6 +138,30 @@ export interface IConfig { // and 1.0 means "trace performance at every opportunity". tracesSampleRate: number; }; + openMetrics?: { + /** + * If `true`, expose a web server for server metrics, e.g. performance. + * + * Intended to be used with Prometheus or another Open Metrics scrapper. + */ + enabled: boolean; + /** + * The port on which to expose server metrics. + */ + port: number; + /** + * The path at which to collect health metrics. + * + * If unspecified, use `"/metrics"`. + */ + endpoint: string; + /** + * If specified, only serve this address mask. + * + * If unspecified, use 0.0.0.0 (accessible by any host). + */ + address: string; + } }; web: { enabled: boolean; @@ -167,6 +230,12 @@ const defaultConfig: IConfig = { healthyStatus: 200, unhealthyStatus: 418, }, + openMetrics: { + enabled: false, + port: 9090, + address: "0.0.0.0", + endpoint: "/metrics", + } }, web: { enabled: false, diff --git a/src/index.ts b/src/index.ts index 67237bcf..d4e3531a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -29,7 +29,7 @@ import { import { read as configRead } from "./config"; import { Mjolnir } from "./Mjolnir"; -import { initializeSentry, patchMatrixClient } from "./utils"; +import { initializeSentry, initializeGlobalPerformanceMetrics, patchMatrixClient } from "./utils"; (async function () { @@ -46,6 +46,9 @@ import { initializeSentry, patchMatrixClient } from "./utils"; if (config.health.sentry) { initializeSentry(config); } + if (config.health.openMetrics?.enabled) { + initializeGlobalPerformanceMetrics(config); + } const healthz = new Healthz(config); healthz.isHealthy = false; // start off unhealthy if (config.health.healthz.enabled) { diff --git a/src/utils.ts b/src/utils.ts index fd9d62fb..c3c59919 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -25,9 +25,10 @@ import { ClientRequest, IncomingMessage } from "http"; import { default as parseDuration } from "parse-duration"; import * as Sentry from '@sentry/node'; import * as _ from '@sentry/tracing'; // Performing the import activates tracing. +import { collectDefaultMetrics, Counter, Histogram, register } from "prom-client"; import ManagementRoomOutput from "./ManagementRoomOutput"; -import { IConfig } from "./config"; +import { IHealthConfig } from "./config"; import { MatrixSendClient } from "./MatrixEmitter"; // Define a few aliases to simplify parsing durations. @@ -401,17 +402,62 @@ export function patchMatrixClient() { patchMatrixClientForRetry(); } +/** + * Initialize performance measurements for the matrix client. + * + * This method is idempotent. If `config` specifies that Open Metrics + * should not be used, it does nothing. + */ +export function initializeGlobalPerformanceMetrics(config: IHealthConfig) { + if (isGlobalPerformanceMetricsCollectorInitialized || !config.health?.openMetrics?.enabled) { + return; + } + + // Collect the Prometheus-recommended metrics. + collectDefaultMetrics({ register }); + + // Collect matrix-bot-sdk-related metrics. + let originalRequestFn = getRequestFn(); + let perfHistogram = new Histogram({ + name: "mjolnir_performance_http_request", + help: "Duration of HTTP requests in seconds", + }); + let successfulRequestsCounter = new Counter({ + name: "mjolnir_status_api_request_pass", + help: "Number of successful API requests", + }); + let failedRequestsCounter = new Counter({ + name: "mjolnir_status_api_request_fail", + help: "Number of failed API requests", + }); + setRequestFn(async (params: { [k: string]: any }, cb: any) => { + let timer = perfHistogram.startTimer(); + return await originalRequestFn(params, function(error: object, response: any, body: string) { + // Stop timer before calling callback. + timer(); + if (error) { + failedRequestsCounter.inc(); + } else { + successfulRequestsCounter.inc(); + } + cb(error, response, body); + }); + }); + isGlobalPerformanceMetricsCollectorInitialized = true; +} +let isGlobalPerformanceMetricsCollectorInitialized = false; + /** * Initialize Sentry for error monitoring and reporting. * * This method is idempotent. If `config` specifies that Sentry * should not be used, it does nothing. */ -export function initializeSentry(config: IConfig) { +export function initializeSentry(config: IHealthConfig) { if (sentryInitialized) { return; } - if (config.health.sentry) { + if (config.health?.sentry) { // Configure error monitoring with Sentry. let sentry = config.health.sentry; Sentry.init({ @@ -423,4 +469,4 @@ export function initializeSentry(config: IConfig) { } // Set to `true` once we have initialized `Sentry` to ensure // that we do not attempt to initialize it more than once. -let sentryInitialized = false; \ No newline at end of file +let sentryInitialized = false; diff --git a/src/webapis/OpenMetrics.ts b/src/webapis/OpenMetrics.ts new file mode 100644 index 00000000..2bd3d430 --- /dev/null +++ b/src/webapis/OpenMetrics.ts @@ -0,0 +1,102 @@ +/* +Copyright 2022 The Matrix.org Foundation C.I.C. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +import { Server } from "http"; +import express from "express"; +import { LogService } from "matrix-bot-sdk"; +import { IHealthConfig } from "../config"; +import { collectDefaultMetrics, register } from "prom-client"; + +export class OpenMetrics { + private webController: express.Express = express(); + private httpServer?: Server; + + constructor(private readonly config: IHealthConfig) { + // Setup JSON parsing. + this.webController.use(express.json()); + } + + /** + * Start accepting requests to the OpenMetrics API. + * + * Does nothing if openMetrics is disabled in the config. + */ + public async start() { + if (!this.config.health?.openMetrics?.enabled) { + LogService.info("OpenMetrics server is disabled."); + return; + } + // Make sure that we collect the Prometheus-recommended metrics. + try { + collectDefaultMetrics({ register }); + } catch (ex) { + if (ex.message.startsWith("A metric with the name")) { + // `collectDefaultMetrics` throws this error if it is called + // more than once in the same process, as is the case during + // testing. + // + // Sadly, `register.clear()`, which should be designed to + // prevent this, seems to work asynchronously and non-deterministically, + // sometimes not clearing the register at all by the time we re-register + // default metrics and sometimes clearing metrics that we haven't registered + // yet. + // + // Just ignore this error. + } else { + throw ex; + } + } + + LogService.info("Starting OpenMetrics server."); + this.httpServer = this.webController.listen(this.config.health.openMetrics!.port, this.config.health.openMetrics!.address); + this.webController.options(this.config.health.openMetrics!.address, async (_request, response) => { + // reply with CORS options + response.header("Access-Control-Allow-Origin", "*"); + response.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type, Authorization, Date"); + response.header("Access-Control-Allow-Methods", "POST, OPTIONS"); + response.status(200); + return response.send(); + }); + // Respond to Prometheus collection. + LogService.info(`configuring GET ${this.config.health.openMetrics.endpoint}`); + this.webController.get(this.config.health.openMetrics.endpoint, async (_request, response) => { + // set CORS headers for the response + response.header("Access-Control-Allow-Origin", "*"); + response.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type, Authorization, Date"); + response.header("Access-Control-Allow-Methods", "POST, OPTIONS"); + try { + response.set('Content-Type', register.contentType); + response.end(await register.metrics()); + } catch (ex) { + response.status(500).end(ex); + } + }); + LogService.info(`configuring GET ${this.config.health.openMetrics.endpoint}... DONE`); + LogService.info("OpenMetrics server ready."); + } + + public stop() { + if (this.httpServer) { + LogService.info("Stopping OpenMetrics server."); + this.httpServer.close(); + this.httpServer = undefined; + } + } + + public get isEnabled(): boolean { + return !!this.httpServer + } +} diff --git a/src/webapis/WebAPIs.ts b/src/webapis/WebAPIs.ts index 3b22e658..e65da08b 100644 --- a/src/webapis/WebAPIs.ts +++ b/src/webapis/WebAPIs.ts @@ -49,7 +49,7 @@ export class WebAPIs { // configure /report API. if (this.config.web.abuseReporting.enabled) { - console.log(`configuring ${API_PREFIX}/report/:room_id/:event_id...`); + LogService.info(`configuring ${API_PREFIX}/report/:room_id/:event_id...`); this.webController.options(`${API_PREFIX}/report/:room_id/:event_id`, async (request, response) => { // reply with CORS options response.header("Access-Control-Allow-Origin", "*"); @@ -66,7 +66,7 @@ export class WebAPIs { response.header("Access-Control-Allow-Methods", "POST, OPTIONS"); await this.handleReport({ request, response, roomId: request.params.room_id, eventId: request.params.event_id }) }); - console.log(`configuring ${API_PREFIX}/report/:room_id/:event_id... DONE`); + LogService.info(`configuring ${API_PREFIX}/report/:room_id/:event_id... DONE`); } // configure ruleServer API. @@ -88,7 +88,7 @@ export class WebAPIs { public stop() { if (this.httpServer) { - console.log("Stopping WebAPIs."); + LogService.info("Stopping WebAPIs."); this.httpServer.close(); this.httpServer = undefined; } diff --git a/test/integration/fixtures.ts b/test/integration/fixtures.ts index d19c0aca..921bd3bd 100644 --- a/test/integration/fixtures.ts +++ b/test/integration/fixtures.ts @@ -1,5 +1,6 @@ import { read as configRead } from "../../src/config"; import { makeMjolnir, teardownManagementRoom } from "./mjolnirSetupUtils"; +import { register } from "prom-client"; // When Mjolnir starts (src/index.ts) it clobbers the config by resolving the management room // alias specified in the config (config.managementRoom) and overwriting that with the room ID. diff --git a/test/integration/mjolnirSetupUtils.ts b/test/integration/mjolnirSetupUtils.ts index 866c040d..1082c606 100644 --- a/test/integration/mjolnirSetupUtils.ts +++ b/test/integration/mjolnirSetupUtils.ts @@ -21,9 +21,10 @@ import { LogLevel, RichConsoleLogger } from "matrix-bot-sdk"; + import { Mjolnir} from '../../src/Mjolnir'; import { overrideRatelimitForUser, registerUser } from "./clientHelper"; -import { initializeSentry, patchMatrixClient } from "../../src/utils"; +import { initializeGlobalPerformanceMetrics, initializeSentry, patchMatrixClient } from "../../src/utils"; import { IConfig } from "../../src/config"; /** @@ -51,6 +52,8 @@ export async function ensureAliasedRoomExists(client: MatrixClient, alias: strin async function configureMjolnir(config: IConfig) { // Initialize error monitoring as early as possible. initializeSentry(config); + initializeGlobalPerformanceMetrics(config); + try { await registerUser(config.homeserverUrl, config.pantalaimon.username, config.pantalaimon.username, config.pantalaimon.password, true) } catch (e) { diff --git a/test/integration/openMetricsTest.ts b/test/integration/openMetricsTest.ts new file mode 100644 index 00000000..fb1d57d2 --- /dev/null +++ b/test/integration/openMetricsTest.ts @@ -0,0 +1,43 @@ +import { strict as assert } from "assert"; +import { getRequestFn } from "matrix-bot-sdk"; + +import { IConfig } from "../../src/config"; + +async function fetchMetrics(config: IConfig): Promise { + if (!config.health.openMetrics?.enabled) { + throw new TypeError("openMetrics deactivated, cannot fetch"); + } + let uri = new URL("http://localhost"); + uri.port = `${config.health.openMetrics!.port}`; + uri.pathname = config.health.openMetrics!.endpoint; + return await new Promise((resolve, reject) => + getRequestFn()({ + method: "GET", + uri + }, (error: object, _response: any, body: string) => { + if (error) { + reject(error); + } else { + resolve(body); + } + }) + ); +} + +describe("Test that we can read metrics using the API", function() { + it('can fetch default metrics', async function() { + console.debug("config", this.mjolnir.config); + let metrics = await fetchMetrics(this.mjolnir.config); + console.debug("Got metrics", metrics); + + // Sample of Prometheus-recommended metrics. + for (let name of ["process_cpu_seconds_total"]) { + assert(metrics.includes(name), `Metrics should contain default metric \`${name}\``); + } + + // Sample of metrics that we're injecting. + for (let name of ["mjolnir_performance_http_request_sum", "mjolnir_status_api_request_pass", "mjolnir_status_api_request_fail"]) { + assert(metrics.includes(name), `Metrics should contain custom metric \`${name}\``); + } + }) +}); \ No newline at end of file diff --git a/tsconfig.json b/tsconfig.json index f048822e..ffdc6a66 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -27,5 +27,6 @@ "./test/integration/banListTest.ts", "./test/integration/reportPollingTest", "./test/integration/policyConsumptionTest.ts", + "./test/integration/openMetricsTest.ts", ] }