feat(server): improving knex query stack traces #3855

Merged: 3 commits, Jan 21, 2025
11 changes: 7 additions & 4 deletions packages/server/bootstrap.js
@@ -30,10 +30,6 @@ if (isApolloMonitoringEnabled() && !getApolloServerVersion()) {
   process.env.APOLLO_SERVER_USER_VERSION = getServerVersion()
 }
 
-//knex is a singleton controlled by module so can't wait til app init
-const { initOpenTelemetry } = require('./otel')
-initOpenTelemetry()
-
 // If running in test env, load .env.test first
 // (appRoot necessary, cause env files aren't loaded through require() calls)
 if (isTestEnv()) {
@@ -49,6 +45,13 @@ if (isTestEnv()) {
 
 dotenv.config({ path: `${packageRoot}/.env` })
 
+// knex is a singleton controlled by module so can't wait til app init
+const { initOpenTelemetry } = require('./otel')
+initOpenTelemetry()
+
+const { patchKnex } = require('./modules/core/patches/knex')
+patchKnex()
+
 module.exports = {
   appRoot,
   packageRoot
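
Note on ordering: initOpenTelemetry() and patchKnex() presumably both depend on environment configuration, which is why the diff moves them below dotenv.config(); and because knex is a module-level singleton, the prototype patch has to be installed before the first query is compiled. A minimal sketch of the resulting bootstrap order (the require paths are from this PR; the final consumer line is hypothetical):

// Sketch: bootstrap ordering only, not the full file
require('dotenv').config() // 1. env first, so feature flags are readable

const { initOpenTelemetry } = require('./otel')
initOpenTelemetry() // 2. otel before knex does any work (knex is a singleton)

const { patchKnex } = require('./modules/core/patches/knex')
patchKnex() // 3. install the toSQL wrapper before any query is compiled

const db = require('./db/knex') // 4. hypothetical consumer; sees patched knex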
25 changes: 17 additions & 8 deletions packages/server/logging/knexMonitoring.ts
@@ -5,6 +5,7 @@ import { Logger } from 'pino'
 import { toNDecimalPlaces } from '@/modules/core/utils/formatting'
 import { omit } from 'lodash'
 import { getRequestContext } from '@/logging/requestContext'
+import { collectLongTrace } from '@speckle/shared'
 
 let metricQueryDuration: prometheusClient.Summary<string>
 let metricQueryErrors: prometheusClient.Counter<string>
@@ -183,6 +184,7 @@ interface QueryEvent extends Knex.Sql {
   __knexUid: string
   __knexTxId: string
   __knexQueryUid: string
+  __stackTrace: string
 }
 
 const initKnexPrometheusMetricsForRegionEvents = async (params: {
@@ -191,20 +193,25 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
   logger: Logger
 }) => {
   const { region, db } = params
-  const queryStartTime: Record<string, number> = {}
+  const queryMetadata: Record<string, { startTime: number; stackTrace: string }> = {}
   const connectionAcquisitionStartTime: Record<string, number> = {}
   const connectionInUseStartTime: Record<string, number> = {}
 
   db.on('query', (data: QueryEvent) => {
     const queryId = data.__knexQueryUid + ''
-    queryStartTime[queryId] = performance.now()
+    queryMetadata[queryId] = {
+      startTime: performance.now(),
+      stackTrace: data.__stackTrace
+    }
   })
 
   db.on('query-response', (_response: unknown, data: QueryEvent) => {
     const queryId = data.__knexQueryUid + ''
-    const durationMs = performance.now() - queryStartTime[queryId]
+    const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {}
+
+    const durationMs = performance.now() - startTime
     const durationSec = toNDecimalPlaces(durationMs / 1000, 2)
-    delete queryStartTime[queryId]
+    delete queryMetadata[queryId]
     if (!isNaN(durationSec))
       metricQueryDuration
         .labels({
@@ -214,7 +221,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
         })
        .observe(durationSec)
 
-    const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim()
     const reqCtx = getRequestContext()
 
     // Update reqCtx with DB query metrics
@@ -223,6 +229,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
       reqCtx.dbMetrics.totalDuration += durationMs || 0
     }
 
+    const trace = stackTrace || collectLongTrace()
     params.logger.info(
       {
         region,
@@ -240,9 +247,11 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
 
   db.on('query-error', (err: unknown, data: QueryEvent) => {
     const queryId = data.__knexQueryUid + ''
-    const durationMs = performance.now() - queryStartTime[queryId]
+    const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {}
+
+    const durationMs = performance.now() - startTime
     const durationSec = toNDecimalPlaces(durationMs / 1000, 2)
-    delete queryStartTime[queryId]
+    delete queryMetadata[queryId]
 
     if (!isNaN(durationSec))
       metricQueryDuration
@@ -254,7 +263,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
         .observe(durationSec)
     metricQueryErrors.inc()
 
-    const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim()
     const reqCtx = getRequestContext()
 
     // Update reqCtx with DB query metrics
@@ -263,6 +271,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
       reqCtx.dbMetrics.totalDuration += durationMs || 0
     }
 
+    const trace = stackTrace || collectLongTrace()
     params.logger.warn(
       {
         err: typeof err === 'object' ? omit(err, 'detail') : err,
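
The handlers above correlate knex's 'query' and 'query-response'/'query-error' events through __knexQueryUid: metadata (the start time plus the patched-in __stackTrace) is stashed on 'query' and consumed, then deleted, when the query settles. A stripped-down, self-contained sketch of the same pattern (connection details and the console logging are placeholders):

import knex from 'knex'
import { performance } from 'node:perf_hooks'

const db = knex({ client: 'pg', connection: process.env.PG_CONNECTION_STRING })

type QueryMeta = { startTime: number; stackTrace?: string }
const pending = new Map<string, QueryMeta>()

db.on('query', (data: { __knexQueryUid: string; __stackTrace?: string }) => {
  pending.set(data.__knexQueryUid, {
    startTime: performance.now(),
    stackTrace: data.__stackTrace // only set when the toSQL patch is active
  })
})

db.on('query-response', (_res: unknown, data: { __knexQueryUid: string }) => {
  const meta = pending.get(data.__knexQueryUid)
  pending.delete(data.__knexQueryUid) // always clean up, or the map leaks
  if (!meta) return
  console.log({
    durationMs: performance.now() - meta.startTime,
    trace: meta.stackTrace
  })
})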
27 changes: 27 additions & 0 deletions packages/server/modules/core/patches/knex.ts
@@ -0,0 +1,27 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
+import { enableImprovedKnexTelemetryStackTraces } from '@/modules/shared/helpers/envHelper'
+import { collectLongTrace } from '@speckle/shared'
+import knexQueryCompiler from 'knex/lib/query/querycompiler'
+
+/**
+ * ⚠️ HERE BE DARKNESS! ⚠️
+ * We're monkey patching knex, which is pretty cursed, but I'm not aware of any other way
+ * to get a proper stack trace in query event handlers
+ */
+
+export const patchKnex = () => {
+  if (enableImprovedKnexTelemetryStackTraces()) {
+    // Preserve stack trace on query compilation
+    const originalToSQL = knexQueryCompiler.prototype.toSQL
+    const newToSQL: typeof originalToSQL = function (
+      this: typeof knexQueryCompiler.prototype,
+      ...args: any
+    ) {
+      const ret = originalToSQL.apply(this, args)
+      ret.__stackTrace = collectLongTrace()
+      return ret
+    }
+
+    knexQueryCompiler.prototype.toSQL = newToSQL
+  }
+}
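
The point of capturing the trace inside toSQL, rather than in the event handlers it replaces, is that a stack collected synchronously during query compilation still contains the application frames that built the query; a stack collected inside an emitter callback mostly shows knex and event-loop internals. A generic illustration of the difference, with no knex involved:

import { EventEmitter } from 'node:events'

const emitter = new EventEmitter()

const buildQuery = () => {
  // Captured here, the trace includes buildQuery() and its callers
  const callerTrace = new Error().stack
  setImmediate(() => emitter.emit('query', { callerTrace }))
}

emitter.on('query', ({ callerTrace }: { callerTrace?: string }) => {
  // A trace captured here instead starts at this callback and the event
  // loop; buildQuery() is nowhere in it
  const handlerTrace = new Error().stack
  console.log('caller trace:\n' + callerTrace)
  console.log('handler trace:\n' + handlerTrace)
})

buildQuery()

This is also why the patch stores the trace on the compiled SQL object itself: that same object surfaces in the QueryEvent payload, so no extra bookkeeping is needed to carry the trace from compile time to the event handlers.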
4 changes: 4 additions & 0 deletions packages/server/modules/shared/helpers/envHelper.ts
@@ -443,3 +443,7 @@ export const knexAsyncStackTracesEnabled = () => {
 export const asyncRequestContextEnabled = () => {
   return getBooleanFromEnv('ASYNC_REQUEST_CONTEXT_ENABLED')
 }
+
+export function enableImprovedKnexTelemetryStackTraces() {
+  return getBooleanFromEnv('KNEX_IMPROVED_TELEMETRY_STACK_TRACES')
+}
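
getBooleanFromEnv is a pre-existing helper in this file; its implementation isn't part of the diff, but the intent is that setting KNEX_IMPROVED_TELEMETRY_STACK_TRACES=true (or another truthy value the helper accepts) turns the patch on. A plausible shape, purely as an illustrative assumption:

// Assumption: not from this PR, just a plausible reading of the helper
const getBooleanFromEnv = (key: string, defaultValue = false): boolean =>
  ['1', 'true'].includes(
    (process.env[key] || String(defaultValue)).toLowerCase()
  )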
22 changes: 22 additions & 0 deletions packages/server/type-augmentations/knex.d.ts
@@ -0,0 +1,22 @@
+// used in patches/knex.ts
+declare module 'knex/lib/query/querycompiler' {
+  declare class QueryCompiler {
+    toSQL(
+      method: string,
+      tz: string
+    ): {
+      method: string
+      options: Record<string, unknown>
+      timeout: number | false
+      cancelOnTimeout: boolean
+      bindings: unknown[]
+      __knexQueryUid: string
+      /**
+       * This one's patched in by us for telemetry purposes
+       */
+      __stackTrace: string
+    }
+  }
+
+  export default QueryCompiler
+}
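
knex ships no type declarations for the internal knex/lib/query/querycompiler path, so without this ambient module declaration the deep import in patches/knex.ts would not type-check. With it in place, the patch file gets a typed toSQL to wrap (sketch):

import knexQueryCompiler from 'knex/lib/query/querycompiler'

// A concrete function type to wrap; its return type includes the
// patched-in __stackTrace: string field declared above
const originalToSQL = knexQueryCompiler.prototype.toSQL
type CompiledQuery = ReturnType<typeof originalToSQL> // has __stackTrace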
11 changes: 11 additions & 0 deletions packages/shared/src/core/helpers/error.ts
@@ -20,3 +20,14 @@ export function throwUncoveredError(e: never): never {
 export function createUncoveredError(e: never) {
   return new Error(`Uncovered error case ${e}.`)
 }
+
+/**
+ * Note: Only V8 and Node.js support controlling the stack trace limit
+ */
+export const collectLongTrace = (limit?: number) => {
+  const originalLimit = Error.stackTraceLimit
+  Error.stackTraceLimit = limit || 30
+  const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim()
+  Error.stackTraceLimit = originalLimit
+  return trace
+}
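
Node's default Error.stackTraceLimit is 10 frames, often too shallow to reach application code from deep inside a query builder; collectLongTrace temporarily raises the limit (30 by default, or a caller-supplied value), captures the stack minus its leading "Error" line, and restores the previous limit. Usage sketch:

import { collectLongTrace } from '@speckle/shared'

const before = Error.stackTraceLimit // 10 unless something changed it

const trace = collectLongTrace(50) // up to 50 frames for this one capture
console.log(trace)

console.log(Error.stackTraceLimit === before) // true; the limit is restored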
5 changes: 5 additions & 0 deletions utils/helm/speckle-server/templates/_helpers.tpl
@@ -782,6 +782,11 @@ Generate the environment variables for Speckle server and Speckle objects deploy
   value: {{ .Values.db.knexAsyncStackTracesEnabled | quote }}
 {{- end}}
 
+{{- if .Values.db.knexImprovedTelemetryStackTraces }}
+- name: KNEX_IMPROVED_TELEMETRY_STACK_TRACES
+  value: {{ .Values.db.knexImprovedTelemetryStackTraces | quote }}
+{{- end}}
+
 - name: PGSSLMODE
   value: "{{ .Values.db.PGSSLMODE }}"
 
5 changes: 5 additions & 0 deletions utils/helm/speckle-server/values.schema.json
@@ -246,6 +246,11 @@
       "description": "If enabled, will provide better stack traces for errors arising out of knex operations",
       "default": false
     },
+    "knexImprovedTelemetryStackTraces": {
+      "type": "boolean",
+      "description": "If enabled, will provide better stack traces for knex telemetry",
+      "default": false
+    },
     "databaseName": {
       "type": "string",
       "description": "(Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.",
3 changes: 3 additions & 0 deletions utils/helm/speckle-server/values.yaml
@@ -197,6 +197,9 @@ db:
   ## @param db.knexAsyncStackTracesEnabled If enabled, will provide better stack traces for errors arising out of knex operations
   ##
   knexAsyncStackTracesEnabled: false
+  ## @param db.knexImprovedTelemetryStackTraces If enabled, will provide better stack traces for knex telemetry
+  ##
+  knexImprovedTelemetryStackTraces: false
 
   ## @param db.databaseName (Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.
   databaseName: ''