sql: add stmt error code to execution insights tables

Part of: cockroachdb#87785. Previously, the insights subsystem did not keep track of `error code` for failed executions and only contained a `FailedExecution` value for the problem field. This commit adds the `error_code` column to the `crdb_internal.{cluster/node}_execution_insights` virtual tables. This commit also bubbles up that error code at the transaction level for writing into the `crdb_internal.{cluster/node}_txn_execution_insights` virtual table. Release note (sql change): Adds `error_code` column to the `crdb_internal.{cluster/node}_execution_insights` virtual tables, which contains the error code for a failed execution. Adds `last_error_code` column to the `crdb_internal.{cluster/node}_txn_execution_insights` virtual tables, which is the error code of the last failed statement in that transaction.
gtr · Feb 10, 2023 · 6c783f3 · 6c783f3
1 parent f8eb2ef
commit 6c783f3
Show file tree

Hide file tree

Showing 11 changed files with 127 additions and 21 deletions.
diff --git a/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant b/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant
@@ -310,15 +310,25 @@ SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0
 ----
 trace_id  parent_span_id  span_id  goroutine_id  finished  start_time  duration  operation
 
-query TTTBTTTTTIITITTTTTI colnames
+query TTTBTTTTTIITITTTTTTTTTTTTTIT colnames
+SELECT * FROM crdb_internal.cluster_execution_insights WHERE query = ''
+----
+session_id  txn_id  txn_fingerprint_id  stmt_id  stmt_fingerprint_id  problem  causes  query  status  start_time  end_time  full_scan  user_name  app_name  database_name  plan_gist  rows_read  rows_written  priority  retries  last_retry_reason  exec_node_ids  contention  contention_events  index_recommendations  implicit_txn  cpu_sql_nanos  error_code
+
+query TTTBTTTTTIITITTTTTTTTTTTTTIT colnames
+SELECT * FROM crdb_internal.node_execution_insights WHERE query = ''
+----
+session_id  txn_id  txn_fingerprint_id  stmt_id  stmt_fingerprint_id  problem  causes  query  status  start_time  end_time  full_scan  user_name  app_name  database_name  plan_gist  rows_read  rows_written  priority  retries  last_retry_reason  exec_node_ids  contention  contention_events  index_recommendations  implicit_txn  cpu_sql_nanos  error_code
+
+query TTTBTTTTTIITITTTTTIT colnames
 SELECT * FROM crdb_internal.cluster_txn_execution_insights WHERE query = ''
 ----
-txn_id  txn_fingerprint_id  query  implicit_txn  session_id  start_time  end_time  user_name  app_name  rows_read  rows_written  priority  retries  last_retry_reason  contention  problems  causes  stmt_execution_ids cpu_sql_nanos
+txn_id  txn_fingerprint_id  query  implicit_txn  session_id  start_time  end_time  user_name  app_name  rows_read  rows_written  priority  retries  last_retry_reason  contention  problems  causes  stmt_execution_ids cpu_sql_nanos last_error_code
 
-query TTTBTTTTTIITITTTTTI colnames
+query TTTBTTTTTIITITTTTTIT colnames
 SELECT * FROM crdb_internal.node_txn_execution_insights WHERE query = ''
 ----
-txn_id  txn_fingerprint_id  query  implicit_txn  session_id  start_time  end_time  user_name  app_name  rows_read  rows_written  priority  retries  last_retry_reason  contention  problems  causes  stmt_execution_ids cpu_sql_nanos
+txn_id  txn_fingerprint_id  query  implicit_txn  session_id  start_time  end_time  user_name  app_name  rows_read  rows_written  priority  retries  last_retry_reason  contention  problems  causes  stmt_execution_ids cpu_sql_nanos last_error_code
 
 query ITTI
 SELECT range_id, start_pretty, end_pretty, lease_holder FROM crdb_internal.ranges

diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go
@@ -7042,7 +7042,8 @@ CREATE TABLE crdb_internal.%s (
   problems                   STRING[] NOT NULL,
   causes                     STRING[] NOT NULL,
   stmt_execution_ids         STRING[] NOT NULL,
-  cpu_sql_nanos              INT8
+  cpu_sql_nanos              INT8,
+  last_error_code            STRING
 )`
 
 var crdbInternalClusterTxnExecutionInsightsTable = virtualSchemaTable{
@@ -7151,6 +7152,13 @@ func populateTxnExecutionInsights(
 			}
 		}
 
+		errorCode := tree.DNull
+		for _, stmt := range insight.Statements {
+			if stmt.ErrorCode != "" {
+				errorCode = tree.NewDString(stmt.ErrorCode)
+			}
+		}
+
 		err = errors.CombineErrors(err, addRow(
 			tree.NewDUuid(tree.DUuid{UUID: insight.Transaction.ID}),
 			tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Transaction.FingerprintID)))),
@@ -7171,6 +7179,7 @@ func populateTxnExecutionInsights(
 			causes,
 			stmtIDs,
 			tree.NewDInt(tree.DInt(insight.Transaction.CPUSQLNanos)),
+			errorCode,
 		))
 
 		if err != nil {
@@ -7210,7 +7219,8 @@ CREATE TABLE crdb_internal.%s (
 	contention_events          JSONB,
 	index_recommendations      STRING[] NOT NULL,
 	implicit_txn               BOOL NOT NULL,
-	cpu_sql_nanos              INT8
+	cpu_sql_nanos              INT8,
+	error_code                 STRING
 )`
 
 var crdbInternalClusterExecutionInsightsTable = virtualSchemaTable{
@@ -7341,6 +7351,7 @@ func populateStmtInsights(
 				indexRecommendations,
 				tree.MakeDBool(tree.DBool(insight.Transaction.ImplicitTxn)),
 				tree.NewDInt(tree.DInt(s.CPUSQLNanos)),
+				tree.NewDString(s.ErrorCode),
 			))
 		}
 	}

diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog b/pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog
diff --git a/pkg/sql/sqlstats/insights/BUILD.bazel b/pkg/sql/sqlstats/insights/BUILD.bazel
@@ -53,6 +53,7 @@ go_test(
         "//pkg/settings/cluster",
         "//pkg/sql/appstatspb",
         "//pkg/sql/clusterunique",
+        "//pkg/sql/pgwire/pgcode",
         "//pkg/util/stop",
         "//pkg/util/uint128",
         "//pkg/util/uuid",

diff --git a/pkg/sql/sqlstats/insights/detector.go b/pkg/sql/sqlstats/insights/detector.go
@@ -165,3 +165,7 @@ func (d *latencyThresholdDetector) enabled() bool {
 func (d *latencyThresholdDetector) isSlow(s *Statement) bool {
 	return d.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&d.st.SV).Seconds()
 }
+
+func isFailed(s *Statement) bool {
+	return s.Status == Statement_Failed
+}
diff --git a/pkg/sql/sqlstats/insights/detector_test.go b/pkg/sql/sqlstats/insights/detector_test.go
@@ -119,6 +119,66 @@ func TestLatencyQuantileDetector(t *testing.T) {
 		}
 	})
 
+	// Testing the slow and failure detectors at the same time.
+	t.Run("isSlow and isFailed", func(t *testing.T) {
+		ctx := context.Background()
+		st := cluster.MakeTestingClusterSettings()
+		AnomalyDetectionEnabled.Override(ctx, &st.SV, true)
+		AnomalyDetectionLatencyThreshold.Override(ctx, &st.SV, 100*time.Millisecond)
+
+		tests := []struct {
+			name             string
+			seedLatency      time.Duration
+			candidateLatency time.Duration
+			status           Statement_Status
+			isSlow           bool
+			isFailed         bool
+		}{{
+			name:             "slow and failed statement",
+			seedLatency:      100 * time.Millisecond,
+			candidateLatency: 200 * time.Millisecond,
+			status:           Statement_Failed,
+			isSlow:           true,
+			isFailed:         true,
+		}, {
+			name:             "slow and non-failed statement",
+			seedLatency:      100 * time.Millisecond,
+			candidateLatency: 200 * time.Millisecond,
+			status:           Statement_Completed,
+			isSlow:           true,
+			isFailed:         false,
+		}, {
+			name:             "fast and non-failed statement",
+			seedLatency:      100 * time.Millisecond,
+			candidateLatency: 50 * time.Millisecond,
+			status:           Statement_Completed,
+			isSlow:           false,
+			isFailed:         false,
+		}, {
+			name:             "fast and failed statement",
+			seedLatency:      100 * time.Millisecond,
+			candidateLatency: 50 * time.Millisecond,
+			status:           Statement_Failed,
+			isSlow:           false,
+			isFailed:         true,
+		}}
+
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				d := newAnomalyDetector(st, NewMetrics())
+				for i := 0; i < 1000; i++ {
+					d.isSlow(&Statement{LatencyInSeconds: test.seedLatency.Seconds()})
+				}
+				stmt := &Statement{LatencyInSeconds: test.candidateLatency.Seconds(), Status: test.status}
+				require.Equal(t, test.isSlow, d.isSlow(stmt))
+				require.Equal(t, test.isFailed, isFailed(stmt))
+			})
+		}
+	})
+
+	// The statement is fast and has failed. This should hit the failure
+	// detector but not the slow detector.
+
 	t.Run("metrics", func(t *testing.T) {
 		ctx := context.Background()
 		st := cluster.MakeTestingClusterSettings()

diff --git a/pkg/sql/sqlstats/insights/insights.proto b/pkg/sql/sqlstats/insights/insights.proto
@@ -120,6 +120,7 @@ message Statement {
   Problem problem = 21;
   repeated Cause causes = 22;
   int64 cpu_sql_nanos = 23 [(gogoproto.customname) = "CPUSQLNanos"];
+  string error_code = 24;
 }
 
 

diff --git a/pkg/sql/sqlstats/insights/integration/insights_test.go b/pkg/sql/sqlstats/insights/integration/insights_test.go
@@ -100,7 +100,8 @@ func TestInsightsIntegration(t *testing.T) {
 			"end_time, "+
 			"full_scan, "+
 			"implicit_txn, "+
-			"cpu_sql_nanos "+
+			"cpu_sql_nanos, "+
+			"error_code "+
 			"FROM crdb_internal.node_execution_insights where "+
 			"query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName)
 
@@ -109,7 +110,8 @@ func TestInsightsIntegration(t *testing.T) {
 		var fullScan bool
 		var implicitTxn bool
 		var cpuSQLNanos int64
-		err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos)
+		var errorCode string
+		err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos, &errorCode)
 
 		if err != nil {
 			return err
@@ -142,15 +144,17 @@ func TestInsightsIntegration(t *testing.T) {
 			"start_time, "+
 			"end_time, "+
 			"implicit_txn, "+
-			"cpu_sql_nanos "+
+			"cpu_sql_nanos, "+
+			"error_code "+
 			"FROM crdb_internal.cluster_txn_execution_insights WHERE "+
 			"query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName)
 
 		var query string
 		var startInsights, endInsights time.Time
 		var implicitTxn bool
 		var cpuSQLNanos int64
-		err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos)
+		var errorCode string
+		err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos, &errorCode)
 
 		if err != nil {
 			return err

diff --git a/pkg/sql/sqlstats/insights/registry.go b/pkg/sql/sqlstats/insights/registry.go
@@ -97,10 +97,11 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
 	delete(r.statements, sessionID)
 	defer statements.release()
 
-	var slowStatements intsets.Fast
+	// Mark statements which are detected as slow or have a failed status.
+	var slowOrFailedStatements intsets.Fast
 	for i, s := range *statements {
-		if r.detector.isSlow(s) {
-			slowStatements.Add(i)
+		if r.detector.isSlow(s) || isFailed(s) {
+			slowOrFailedStatements.Add(i)
 		}
 	}
 
@@ -112,8 +113,8 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
 		highContention = transaction.Contention.Seconds() >= LatencyThreshold.Get(&r.causes.st.SV).Seconds()
 	}
 
-	if slowStatements.Empty() && !highContention {
-		// We only record an insight if we have slow statements or high txn contention.
+	if slowOrFailedStatements.Empty() && !highContention {
+		// We only record an insight if we have slow or failed statements or high txn contention.
 		return
 	}
 
@@ -127,14 +128,12 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
 	}
 
 	for i, s := range *statements {
-		if slowStatements.Contains(i) {
+		if slowOrFailedStatements.Contains(i) {
 			switch s.Status {
 			case Statement_Completed:
 				s.Problem = Problem_SlowExecution
 				s.Causes = r.causes.examine(s.Causes, s)
 			case Statement_Failed:
-				// Note that we'll be building better failure support for 23.1.
-				// For now, we only mark failed statements that were also slow.
 				s.Problem = Problem_FailedExecution
 			}
 

diff --git a/pkg/sql/sqlstats/insights/registry_test.go b/pkg/sql/sqlstats/insights/registry_test.go
@@ -20,6 +20,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/sql/appstatspb"
 	"github.com/cockroachdb/cockroach/pkg/sql/clusterunique"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
 	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/stretchr/testify/require"
 )
@@ -32,6 +33,14 @@ func newStmtWithProblemAndCauses(stmt *Statement, problem Problem, causes []Caus
 	return &newStmt
 }
 
+// Return a new failed statement with an added errorCode.
+func newFailedStmtWithErrorCode(stmt *Statement, errorCode string) *Statement {
+	newStmt := *stmt
+	newStmt.Problem = Problem_FailedExecution
+	newStmt.ErrorCode = errorCode
+	return &newStmt
+}
+
 func TestRegistry(t *testing.T) {
 	ctx := context.Background()
 
@@ -80,6 +89,7 @@ func TestRegistry(t *testing.T) {
 			FingerprintID:    appstatspb.StmtFingerprintID(100),
 			LatencyInSeconds: 2,
 			Status:           Statement_Failed,
+			ErrorCode:        "22012",
 		}
 
 		st := cluster.MakeTestingClusterSettings()
@@ -93,7 +103,7 @@ func TestRegistry(t *testing.T) {
 			Session:     session,
 			Transaction: transaction,
 			Statements: []*Statement{
-				newStmtWithProblemAndCauses(statement, Problem_FailedExecution, nil),
+				newFailedStmtWithErrorCode(statement, pgcode.DivisionByZero.String()),
 			},
 		}}
 		var actual []*Insight

diff --git a/pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go b/pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go
@@ -202,6 +202,11 @@ func (s *Container) RecordStatement(
 		cpuSQLNanos = value.ExecStats.CPUTime.Nanoseconds()
 	}
 
+	var errorCode string
+	if value.StatementError != nil {
+		errorCode = pgerror.GetPGCode(value.StatementError).String()
+	}
+
 	s.insights.ObserveStatement(value.SessionID, &insights.Statement{
 		ID:                   value.StatementID,
 		FingerprintID:        stmtFingerprintID,
@@ -222,6 +227,7 @@ func (s *Container) RecordStatement(
 		IndexRecommendations: value.IndexRecommendations,
 		Database:             value.Database,
 		CPUSQLNanos:          cpuSQLNanos,
+		ErrorCode:            errorCode,
 	})
 
 	return stats.ID, nil