From 8fec57c1cfa1620bdc07071ea7893b84aea2858f Mon Sep 17 00:00:00 2001 From: Kristaps Fabians Geikins Date: Tue, 21 Jan 2025 10:43:49 +0200 Subject: [PATCH 1/3] feat(server): improving knex query stack traces --- packages/server/bootstrap.js | 5 +++- packages/server/logging/knexMonitoring.ts | 30 +++++++++++++------- packages/server/modules/core/patches/knex.ts | 23 +++++++++++++++ packages/server/type-augmentations/knex.d.ts | 22 ++++++++++++++ 4 files changed, 69 insertions(+), 11 deletions(-) create mode 100644 packages/server/modules/core/patches/knex.ts create mode 100644 packages/server/type-augmentations/knex.d.ts diff --git a/packages/server/bootstrap.js b/packages/server/bootstrap.js index 0cdb3ef547..d1d60e7e6a 100644 --- a/packages/server/bootstrap.js +++ b/packages/server/bootstrap.js @@ -30,10 +30,13 @@ if (isApolloMonitoringEnabled() && !getApolloServerVersion()) { process.env.APOLLO_SERVER_USER_VERSION = getServerVersion() } -//knex is a singleton controlled by module so can't wait til app init +// knex is a singleton controlled by module so can't wait til app init const { initOpenTelemetry } = require('./otel') initOpenTelemetry() +const { patchKnex } = require('./modules/core/patches/knex') +patchKnex() + // If running in test env, load .env.test first // (appRoot necessary, cause env files aren't loaded through require() calls) if (isTestEnv()) { diff --git a/packages/server/logging/knexMonitoring.ts b/packages/server/logging/knexMonitoring.ts index 51d3eaf6e0..e79787815b 100644 --- a/packages/server/logging/knexMonitoring.ts +++ b/packages/server/logging/knexMonitoring.ts @@ -183,6 +183,7 @@ interface QueryEvent extends Knex.Sql { __knexUid: string __knexTxId: string __knexQueryUid: string + __stackTrace: string } const initKnexPrometheusMetricsForRegionEvents = async (params: { @@ -191,20 +192,29 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { logger: Logger }) => { const { region, db } = params - const queryStartTime: Record = {} + const queryMetadata: Record = {} const connectionAcquisitionStartTime: Record = {} const connectionInUseStartTime: Record = {} + db.on('start', (builder: Knex.QueryBuilder) => { + console.log(!!builder) + }) + db.on('query', (data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - queryStartTime[queryId] = performance.now() + queryMetadata[queryId] = { + startTime: performance.now(), + stackTrace: data.__stackTrace + } }) db.on('query-response', (_response: unknown, data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - const durationMs = performance.now() - queryStartTime[queryId] + const { startTime, stackTrace } = queryMetadata[queryId] + + const durationMs = performance.now() - startTime const durationSec = toNDecimalPlaces(durationMs / 1000, 2) - delete queryStartTime[queryId] + delete queryMetadata[queryId] if (!isNaN(durationSec)) metricQueryDuration .labels({ @@ -214,7 +224,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { }) .observe(durationSec) - const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() const reqCtx = getRequestContext() // Update reqCtx with DB query metrics @@ -231,7 +240,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { sqlQueryId: queryId, sqlQueryDurationMs: toNDecimalPlaces(durationMs, 0), sqlNumberBindings: data.bindings?.length || -1, - trace, + trace: stackTrace, ...(reqCtx ? { req: { id: reqCtx.requestId } } : {}) }, 'DB query successfully completed after {sqlQueryDurationMs} ms' @@ -240,9 +249,11 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { db.on('query-error', (err: unknown, data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - const durationMs = performance.now() - queryStartTime[queryId] + const { startTime, stackTrace } = queryMetadata[queryId] + + const durationMs = performance.now() - startTime const durationSec = toNDecimalPlaces(durationMs / 1000, 2) - delete queryStartTime[queryId] + delete queryMetadata[queryId] if (!isNaN(durationSec)) metricQueryDuration @@ -254,7 +265,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { .observe(durationSec) metricQueryErrors.inc() - const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() const reqCtx = getRequestContext() // Update reqCtx with DB query metrics @@ -272,7 +282,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { sqlQueryId: queryId, sqlQueryDurationMs: toNDecimalPlaces(durationMs, 0), sqlNumberBindings: data.bindings?.length || -1, - trace, + trace: stackTrace, ...(reqCtx ? { req: { id: reqCtx.requestId } } : {}) }, 'DB query errored for {sqlMethod} after {sqlQueryDurationMs}ms' diff --git a/packages/server/modules/core/patches/knex.ts b/packages/server/modules/core/patches/knex.ts new file mode 100644 index 0000000000..1ccda70cf5 --- /dev/null +++ b/packages/server/modules/core/patches/knex.ts @@ -0,0 +1,23 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +import knexQueryCompiler from 'knex/lib/query/querycompiler' + +/** + * ⚠️ HERE BE DARKNESS! ⚠️ + * We're monkey patching knex which is pretty cursed, but i'm not aware of any other way + * to get a proper stack trace in query event handlers + */ + +export const patchKnex = () => { + // Preserve stack trace on query compilation + const originalToSQL = knexQueryCompiler.prototype.toSQL + const newToSQL: typeof originalToSQL = function ( + this: typeof knexQueryCompiler.prototype, + ...args: any + ) { + const ret = originalToSQL.apply(this, args) + ret.__stackTrace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() + return ret + } + + knexQueryCompiler.prototype.toSQL = newToSQL +} diff --git a/packages/server/type-augmentations/knex.d.ts b/packages/server/type-augmentations/knex.d.ts new file mode 100644 index 0000000000..035be8953d --- /dev/null +++ b/packages/server/type-augmentations/knex.d.ts @@ -0,0 +1,22 @@ +// used in patches/knex.ts +declare module 'knex/lib/query/querycompiler' { + declare class QueryCompiler { + toSQL( + method: string, + tz: string + ): { + method: string + options: Record + timeout: number | false + cancelOnTimeout: boolean + bindings: unknown[] + __knexQueryUid: string + /** + * This one's patched in by us for telemetry purposes + */ + __stackTrace: string + } + } + + export default QueryCompiler +} From 46ace328d1fbed70f3be1a5f441e5e6b6a26e9be Mon Sep 17 00:00:00 2001 From: Kristaps Fabians Geikins Date: Tue, 21 Jan 2025 10:46:43 +0200 Subject: [PATCH 2/3] minor cleanup --- packages/server/logging/knexMonitoring.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/packages/server/logging/knexMonitoring.ts b/packages/server/logging/knexMonitoring.ts index e79787815b..9e03aa05de 100644 --- a/packages/server/logging/knexMonitoring.ts +++ b/packages/server/logging/knexMonitoring.ts @@ -196,10 +196,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { const connectionAcquisitionStartTime: Record = {} const connectionInUseStartTime: Record = {} - db.on('start', (builder: Knex.QueryBuilder) => { - console.log(!!builder) - }) - db.on('query', (data: QueryEvent) => { const queryId = data.__knexQueryUid + '' queryMetadata[queryId] = { @@ -210,7 +206,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { db.on('query-response', (_response: unknown, data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - const { startTime, stackTrace } = queryMetadata[queryId] + const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {} const durationMs = performance.now() - startTime const durationSec = toNDecimalPlaces(durationMs / 1000, 2) @@ -249,7 +245,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { db.on('query-error', (err: unknown, data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - const { startTime, stackTrace } = queryMetadata[queryId] + const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {} const durationMs = performance.now() - startTime const durationSec = toNDecimalPlaces(durationMs / 1000, 2) From f353dc1fea0734b7a6b84b815cf8629e16c21c79 Mon Sep 17 00:00:00 2001 From: Kristaps Fabians Geikins Date: Tue, 21 Jan 2025 11:14:30 +0200 Subject: [PATCH 3/3] more improvements --- packages/server/bootstrap.js | 14 +++++----- packages/server/logging/knexMonitoring.ts | 7 +++-- packages/server/modules/core/patches/knex.ts | 26 +++++++++++-------- .../modules/shared/helpers/envHelper.ts | 4 +++ packages/shared/src/core/helpers/error.ts | 11 ++++++++ .../speckle-server/templates/_helpers.tpl | 5 ++++ utils/helm/speckle-server/values.schema.json | 5 ++++ utils/helm/speckle-server/values.yaml | 3 +++ 8 files changed, 55 insertions(+), 20 deletions(-) diff --git a/packages/server/bootstrap.js b/packages/server/bootstrap.js index d1d60e7e6a..35fc7fc188 100644 --- a/packages/server/bootstrap.js +++ b/packages/server/bootstrap.js @@ -30,13 +30,6 @@ if (isApolloMonitoringEnabled() && !getApolloServerVersion()) { process.env.APOLLO_SERVER_USER_VERSION = getServerVersion() } -// knex is a singleton controlled by module so can't wait til app init -const { initOpenTelemetry } = require('./otel') -initOpenTelemetry() - -const { patchKnex } = require('./modules/core/patches/knex') -patchKnex() - // If running in test env, load .env.test first // (appRoot necessary, cause env files aren't loaded through require() calls) if (isTestEnv()) { @@ -52,6 +45,13 @@ if (isTestEnv()) { dotenv.config({ path: `${packageRoot}/.env` }) +// knex is a singleton controlled by module so can't wait til app init +const { initOpenTelemetry } = require('./otel') +initOpenTelemetry() + +const { patchKnex } = require('./modules/core/patches/knex') +patchKnex() + module.exports = { appRoot, packageRoot diff --git a/packages/server/logging/knexMonitoring.ts b/packages/server/logging/knexMonitoring.ts index 9e03aa05de..604eb69451 100644 --- a/packages/server/logging/knexMonitoring.ts +++ b/packages/server/logging/knexMonitoring.ts @@ -5,6 +5,7 @@ import { Logger } from 'pino' import { toNDecimalPlaces } from '@/modules/core/utils/formatting' import { omit } from 'lodash' import { getRequestContext } from '@/logging/requestContext' +import { collectLongTrace } from '@speckle/shared' let metricQueryDuration: prometheusClient.Summary let metricQueryErrors: prometheusClient.Counter @@ -228,6 +229,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { reqCtx.dbMetrics.totalDuration += durationMs || 0 } + const trace = stackTrace || collectLongTrace() params.logger.info( { region, @@ -236,7 +238,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { sqlQueryId: queryId, sqlQueryDurationMs: toNDecimalPlaces(durationMs, 0), sqlNumberBindings: data.bindings?.length || -1, - trace: stackTrace, + trace, ...(reqCtx ? { req: { id: reqCtx.requestId } } : {}) }, 'DB query successfully completed after {sqlQueryDurationMs} ms' @@ -269,6 +271,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { reqCtx.dbMetrics.totalDuration += durationMs || 0 } + const trace = stackTrace || collectLongTrace() params.logger.warn( { err: typeof err === 'object' ? omit(err, 'detail') : err, @@ -278,7 +281,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { sqlQueryId: queryId, sqlQueryDurationMs: toNDecimalPlaces(durationMs, 0), sqlNumberBindings: data.bindings?.length || -1, - trace: stackTrace, + trace, ...(reqCtx ? { req: { id: reqCtx.requestId } } : {}) }, 'DB query errored for {sqlMethod} after {sqlQueryDurationMs}ms' diff --git a/packages/server/modules/core/patches/knex.ts b/packages/server/modules/core/patches/knex.ts index 1ccda70cf5..3db5e65ca0 100644 --- a/packages/server/modules/core/patches/knex.ts +++ b/packages/server/modules/core/patches/knex.ts @@ -1,4 +1,6 @@ /* eslint-disable @typescript-eslint/no-explicit-any */ +import { enableImprovedKnexTelemetryStackTraces } from '@/modules/shared/helpers/envHelper' +import { collectLongTrace } from '@speckle/shared' import knexQueryCompiler from 'knex/lib/query/querycompiler' /** @@ -8,16 +10,18 @@ import knexQueryCompiler from 'knex/lib/query/querycompiler' */ export const patchKnex = () => { - // Preserve stack trace on query compilation - const originalToSQL = knexQueryCompiler.prototype.toSQL - const newToSQL: typeof originalToSQL = function ( - this: typeof knexQueryCompiler.prototype, - ...args: any - ) { - const ret = originalToSQL.apply(this, args) - ret.__stackTrace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() - return ret - } + if (enableImprovedKnexTelemetryStackTraces()) { + // Preserve stack trace on query compilation + const originalToSQL = knexQueryCompiler.prototype.toSQL + const newToSQL: typeof originalToSQL = function ( + this: typeof knexQueryCompiler.prototype, + ...args: any + ) { + const ret = originalToSQL.apply(this, args) + ret.__stackTrace = collectLongTrace() + return ret + } - knexQueryCompiler.prototype.toSQL = newToSQL + knexQueryCompiler.prototype.toSQL = newToSQL + } } diff --git a/packages/server/modules/shared/helpers/envHelper.ts b/packages/server/modules/shared/helpers/envHelper.ts index f3e2f65f51..e47f9665c7 100644 --- a/packages/server/modules/shared/helpers/envHelper.ts +++ b/packages/server/modules/shared/helpers/envHelper.ts @@ -443,3 +443,7 @@ export const knexAsyncStackTracesEnabled = () => { export const asyncRequestContextEnabled = () => { return getBooleanFromEnv('ASYNC_REQUEST_CONTEXT_ENABLED') } + +export function enableImprovedKnexTelemetryStackTraces() { + return getBooleanFromEnv('KNEX_IMPROVED_TELEMETRY_STACK_TRACES') +} diff --git a/packages/shared/src/core/helpers/error.ts b/packages/shared/src/core/helpers/error.ts index 3fb35b02bb..3a16cdd167 100644 --- a/packages/shared/src/core/helpers/error.ts +++ b/packages/shared/src/core/helpers/error.ts @@ -20,3 +20,14 @@ export function throwUncoveredError(e: never): never { export function createUncoveredError(e: never) { return new Error(`Uncovered error case ${e}.`) } + +/** + * Note: Only V8 and Node.js support controlling the stack trace limit + */ +export const collectLongTrace = (limit?: number) => { + const originalLimit = Error.stackTraceLimit + Error.stackTraceLimit = limit || 30 + const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() + Error.stackTraceLimit = originalLimit + return trace +} diff --git a/utils/helm/speckle-server/templates/_helpers.tpl b/utils/helm/speckle-server/templates/_helpers.tpl index fd27441fcf..d4c1f877de 100644 --- a/utils/helm/speckle-server/templates/_helpers.tpl +++ b/utils/helm/speckle-server/templates/_helpers.tpl @@ -782,6 +782,11 @@ Generate the environment variables for Speckle server and Speckle objects deploy value: {{ .Values.db.knexAsyncStackTracesEnabled | quote }} {{- end}} +{{- if .Values.db.knexImprovedTelemetryStackTraces }} +- name: KNEX_IMPROVED_TELEMETRY_STACK_TRACES + value: {{ .Values.db.knexImprovedTelemetryStackTraces | quote }} +{{- end}} + - name: PGSSLMODE value: "{{ .Values.db.PGSSLMODE }}" diff --git a/utils/helm/speckle-server/values.schema.json b/utils/helm/speckle-server/values.schema.json index 093f61c131..edf79ce327 100644 --- a/utils/helm/speckle-server/values.schema.json +++ b/utils/helm/speckle-server/values.schema.json @@ -246,6 +246,11 @@ "description": "If enabled, will provide better stack traces for errors arising out of knex operations", "default": false }, + "knexImprovedTelemetryStackTraces": { + "type": "boolean", + "description": "If enabled, will provide better stack traces for knex telemetry", + "default": false + }, "databaseName": { "type": "string", "description": "(Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.", diff --git a/utils/helm/speckle-server/values.yaml b/utils/helm/speckle-server/values.yaml index e3bd1e2338..dffeb5dbfa 100644 --- a/utils/helm/speckle-server/values.yaml +++ b/utils/helm/speckle-server/values.yaml @@ -197,6 +197,9 @@ db: ## @param db.knexAsyncStackTracesEnabled If enabled, will provide better stack traces for errors arising out of knex operations ## knexAsyncStackTracesEnabled: false + ## @param db.knexImprovedTelemetryStackTraces If enabled, will provide better stack traces for knex telemetry + ## + knexImprovedTelemetryStackTraces: false ## @param db.databaseName (Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored. databaseName: ''