From 6c783f3da17dde46d169072b970b1554ff0e2cc0 Mon Sep 17 00:00:00 2001 From: gtr Date: Wed, 8 Feb 2023 17:57:36 -0500 Subject: [PATCH] sql: add stmt error code to execution insights tables Part of: #87785. Previously, the insights subsystem did not keep track of `error code` for failed executions and only contained a `FailedExecution` value for the problem field. This commit adds the `error_code` column to the `crdb_internal.{cluster/node}_execution_insights` virtual tables. This commit also bubbles up that error code at the transaction level for writing into the `crdb_internal.{cluster/node}_txn_execution_insights` virtual table. Release note (sql change): Adds `error_code` column to the `crdb_internal.{cluster/node}_execution_insights` virtual tables, which contains the error code for a failed execution. Adds `last_error_code` column to the `crdb_internal.{cluster/node}_txn_execution_insights` virtual tables, which is the error code of the last failed statement in that transaction. --- .../testdata/logic_test/crdb_internal_tenant | 18 ++++-- pkg/sql/crdb_internal.go | 15 ++++- .../testdata/logic_test/crdb_internal_catalog | 4 +- pkg/sql/sqlstats/insights/BUILD.bazel | 1 + pkg/sql/sqlstats/insights/detector.go | 4 ++ pkg/sql/sqlstats/insights/detector_test.go | 60 +++++++++++++++++++ pkg/sql/sqlstats/insights/insights.proto | 1 + .../insights/integration/insights_test.go | 12 ++-- pkg/sql/sqlstats/insights/registry.go | 15 +++-- pkg/sql/sqlstats/insights/registry_test.go | 12 +++- .../sqlstats/ssmemstorage/ss_mem_writer.go | 6 ++ 11 files changed, 127 insertions(+), 21 deletions(-) diff --git a/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant b/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant index 314a2dcdf54b..41662b8f19fe 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant +++ b/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant @@ -310,15 +310,25 @@ SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0 ---- trace_id parent_span_id span_id goroutine_id finished start_time duration operation -query TTTBTTTTTIITITTTTTI colnames +query TTTBTTTTTIITITTTTTTTTTTTTTIT colnames +SELECT * FROM crdb_internal.cluster_execution_insights WHERE query = '' +---- +session_id txn_id txn_fingerprint_id stmt_id stmt_fingerprint_id problem causes query status start_time end_time full_scan user_name app_name database_name plan_gist rows_read rows_written priority retries last_retry_reason exec_node_ids contention contention_events index_recommendations implicit_txn cpu_sql_nanos error_code + +query TTTBTTTTTIITITTTTTTTTTTTTTIT colnames +SELECT * FROM crdb_internal.node_execution_insights WHERE query = '' +---- +session_id txn_id txn_fingerprint_id stmt_id stmt_fingerprint_id problem causes query status start_time end_time full_scan user_name app_name database_name plan_gist rows_read rows_written priority retries last_retry_reason exec_node_ids contention contention_events index_recommendations implicit_txn cpu_sql_nanos error_code + +query TTTBTTTTTIITITTTTTIT colnames SELECT * FROM crdb_internal.cluster_txn_execution_insights WHERE query = '' ---- -txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos +txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code -query TTTBTTTTTIITITTTTTI colnames +query TTTBTTTTTIITITTTTTIT colnames SELECT * FROM crdb_internal.node_txn_execution_insights WHERE query = '' ---- -txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos +txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code query ITTI SELECT range_id, start_pretty, end_pretty, lease_holder FROM crdb_internal.ranges diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go index a97d76ec9dd7..b6de480d5563 100644 --- a/pkg/sql/crdb_internal.go +++ b/pkg/sql/crdb_internal.go @@ -7042,7 +7042,8 @@ CREATE TABLE crdb_internal.%s ( problems STRING[] NOT NULL, causes STRING[] NOT NULL, stmt_execution_ids STRING[] NOT NULL, - cpu_sql_nanos INT8 + cpu_sql_nanos INT8, + last_error_code STRING )` var crdbInternalClusterTxnExecutionInsightsTable = virtualSchemaTable{ @@ -7151,6 +7152,13 @@ func populateTxnExecutionInsights( } } + errorCode := tree.DNull + for _, stmt := range insight.Statements { + if stmt.ErrorCode != "" { + errorCode = tree.NewDString(stmt.ErrorCode) + } + } + err = errors.CombineErrors(err, addRow( tree.NewDUuid(tree.DUuid{UUID: insight.Transaction.ID}), tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Transaction.FingerprintID)))), @@ -7171,6 +7179,7 @@ func populateTxnExecutionInsights( causes, stmtIDs, tree.NewDInt(tree.DInt(insight.Transaction.CPUSQLNanos)), + errorCode, )) if err != nil { @@ -7210,7 +7219,8 @@ CREATE TABLE crdb_internal.%s ( contention_events JSONB, index_recommendations STRING[] NOT NULL, implicit_txn BOOL NOT NULL, - cpu_sql_nanos INT8 + cpu_sql_nanos INT8, + error_code STRING )` var crdbInternalClusterExecutionInsightsTable = virtualSchemaTable{ @@ -7341,6 +7351,7 @@ func populateStmtInsights( indexRecommendations, tree.MakeDBool(tree.DBool(insight.Transaction.ImplicitTxn)), tree.NewDInt(tree.DInt(s.CPUSQLNanos)), + tree.NewDString(s.ErrorCode), )) } } diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog b/pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog index e2f38e77e3b7..123ec08434ce 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog @@ -420,7 +420,7 @@ SELECT id, strip_volatile(descriptor) FROM crdb_internal.kv_catalog_descriptor 4294967245 {"table": {"columns": [{"id": 1, "name": "node_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 2, "name": "session_id", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 3, "name": "user_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "client_address", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "application_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 6, "name": "active_queries", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "last_active_query", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 8, "name": "num_txns_executed", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 9, "name": "session_start", "nullable": true, "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 10, "name": "active_query_start", "nullable": true, "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 11, "name": "kv_txn", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 12, "name": "alloc_bytes", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 13, "name": "max_alloc_bytes", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 14, "name": "status", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "session_end", "nullable": true, "type": {"family": "TimestampFamily", "oid": 1114}}], "formatVersion": 3, "id": 4294967245, "name": "node_sessions", "nextColumnId": 16, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967246 {"table": {"columns": [{"id": 1, "name": "id", "nullable": true, "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 2, "name": "node_id", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "session_id", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "start", "nullable": true, "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 5, "name": "txn_string", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 6, "name": "application_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "num_stmts", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 8, "name": "num_retries", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 9, "name": "num_auto_retries", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 10, "name": "last_auto_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}], "formatVersion": 3, "id": 4294967246, "name": "node_transactions", "nextColumnId": 11, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967247 {"table": {"columns": [{"id": 1, "name": "query_id", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 2, "name": "txn_id", "nullable": true, "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 3, "name": "node_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 4, "name": "session_id", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "user_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 6, "name": "start", "nullable": true, "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 7, "name": "query", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 8, "name": "client_address", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "application_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "distributed", "nullable": true, "type": {"oid": 16}}, {"id": 11, "name": "phase", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 12, "name": "full_scan", "nullable": true, "type": {"oid": 16}}, {"id": 13, "name": "plan_gist", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 14, "name": "database", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}], "formatVersion": 3, "id": 4294967247, "name": "node_queries", "nextColumnId": 15, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} -4294967248 {"table": {"columns": [{"id": 1, "name": "session_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 2, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 3, "name": "txn_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 4, "name": "stmt_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "stmt_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 6, "name": "problem", "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "causes", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 8, "name": "query", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "status", "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "start_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 11, "name": "end_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 12, "name": "full_scan", "type": {"oid": 16}}, {"id": 13, "name": "user_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 14, "name": "app_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "database_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 16, "name": "plan_gist", "type": {"family": "StringFamily", "oid": 25}}, {"id": 17, "name": "rows_read", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 18, "name": "rows_written", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 19, "name": "priority", "type": {"family": "StringFamily", "oid": 25}}, {"id": 20, "name": "retries", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 21, "name": "last_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 22, "name": "exec_node_ids", "type": {"arrayContents": {"family": "IntFamily", "oid": 20, "width": 64}, "arrayElemType": "IntFamily", "family": "ArrayFamily", "oid": 1016, "width": 64}}, {"id": 23, "name": "contention", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 24, "name": "contention_events", "nullable": true, "type": {"family": "JsonFamily", "oid": 3802}}, {"id": 25, "name": "index_recommendations", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 26, "name": "implicit_txn", "type": {"oid": 16}}, {"id": 27, "name": "cpu_sql_nanos", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967248, "name": "node_execution_insights", "nextColumnId": 28, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} +4294967248 {"table": {"columns": [{"id": 1, "name": "session_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 2, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 3, "name": "txn_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 4, "name": "stmt_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "stmt_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 6, "name": "problem", "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "causes", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 8, "name": "query", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "status", "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "start_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 11, "name": "end_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 12, "name": "full_scan", "type": {"oid": 16}}, {"id": 13, "name": "user_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 14, "name": "app_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "database_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 16, "name": "plan_gist", "type": {"family": "StringFamily", "oid": 25}}, {"id": 17, "name": "rows_read", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 18, "name": "rows_written", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 19, "name": "priority", "type": {"family": "StringFamily", "oid": 25}}, {"id": 20, "name": "retries", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 21, "name": "last_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 22, "name": "exec_node_ids", "type": {"arrayContents": {"family": "IntFamily", "oid": 20, "width": 64}, "arrayElemType": "IntFamily", "family": "ArrayFamily", "oid": 1016, "width": 64}}, {"id": 23, "name": "contention", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 24, "name": "contention_events", "nullable": true, "type": {"family": "JsonFamily", "oid": 3802}}, {"id": 25, "name": "index_recommendations", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 26, "name": "implicit_txn", "type": {"oid": 16}}, {"id": 27, "name": "cpu_sql_nanos", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 28, "name": "error_code", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}], "formatVersion": 3, "id": 4294967248, "name": "node_execution_insights", "nextColumnId": 29, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967249 {"table": {"columns": [{"id": 1, "name": "flow_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 2, "name": "node_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "stmt", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "since", "type": {"family": "TimestampTZFamily", "oid": 1184}}, {"id": 5, "name": "status", "type": {"family": "StringFamily", "oid": 25}}], "formatVersion": 3, "id": 4294967249, "name": "node_distsql_flows", "nextColumnId": 6, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967250 {"table": {"columns": [{"id": 1, "name": "table_id", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 2, "name": "index_id", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "num_contention_events", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 4, "name": "cumulative_contention_time", "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 5, "name": "key", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 6, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 7, "name": "count", "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967250, "name": "node_contention_events", "nextColumnId": 8, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967251 {"table": {"columns": [{"id": 1, "name": "node_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 2, "name": "table_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "parent_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 5, "name": "expiration", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 6, "name": "deleted", "type": {"oid": 16}}], "formatVersion": 3, "id": 4294967251, "name": "leases", "nextColumnId": 7, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} @@ -454,7 +454,7 @@ SELECT id, strip_volatile(descriptor) FROM crdb_internal.kv_catalog_descriptor 4294967279 {"table": {"columns": [{"id": 1, "name": "range_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 2, "name": "table_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "database_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "schema_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "table_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 6, "name": "index_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "lock_key", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 8, "name": "lock_key_pretty", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "txn_id", "nullable": true, "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 10, "name": "ts", "nullable": true, "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 11, "name": "lock_strength", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 12, "name": "durability", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 13, "name": "granted", "nullable": true, "type": {"oid": 16}}, {"id": 14, "name": "contended", "type": {"oid": 16}}, {"id": 15, "name": "duration", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}], "formatVersion": 3, "id": 4294967279, "indexes": [{"foreignKey": {}, "geoConfig": {}, "id": 2, "interleave": {}, "keyColumnDirections": ["ASC"], "keyColumnIds": [2], "keyColumnNames": ["table_id"], "name": "cluster_locks_table_id_idx", "partitioning": {}, "sharded": {}, "storeColumnIds": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "storeColumnNames": ["range_id", "database_name", "schema_name", "table_name", "index_name", "lock_key", "lock_key_pretty", "txn_id", "ts", "lock_strength", "durability", "granted", "contended", "duration"], "version": 3}, {"foreignKey": {}, "geoConfig": {}, "id": 3, "interleave": {}, "keyColumnDirections": ["ASC"], "keyColumnIds": [3], "keyColumnNames": ["database_name"], "name": "cluster_locks_database_name_idx", "partitioning": {}, "sharded": {}, "storeColumnIds": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "storeColumnNames": ["range_id", "table_id", "schema_name", "table_name", "index_name", "lock_key", "lock_key_pretty", "txn_id", "ts", "lock_strength", "durability", "granted", "contended", "duration"], "version": 3}, {"foreignKey": {}, "geoConfig": {}, "id": 4, "interleave": {}, "keyColumnDirections": ["ASC"], "keyColumnIds": [5], "keyColumnNames": ["table_name"], "name": "cluster_locks_table_name_idx", "partitioning": {}, "sharded": {}, "storeColumnIds": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "storeColumnNames": ["range_id", "table_id", "database_name", "schema_name", "index_name", "lock_key", "lock_key_pretty", "txn_id", "ts", "lock_strength", "durability", "granted", "contended", "duration"], "version": 3}, {"foreignKey": {}, "geoConfig": {}, "id": 5, "interleave": {}, "keyColumnDirections": ["ASC"], "keyColumnIds": [14], "keyColumnNames": ["contended"], "name": "cluster_locks_contended_idx", "partitioning": {}, "sharded": {}, "storeColumnIds": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15], "storeColumnNames": ["range_id", "table_id", "database_name", "schema_name", "table_name", "index_name", "lock_key", "lock_key_pretty", "txn_id", "ts", "lock_strength", "durability", "granted", "duration"], "version": 3}], "name": "cluster_locks", "nextColumnId": 16, "nextConstraintId": 2, "nextIndexId": 6, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967280 {"table": {"columns": [{"id": 1, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 2, "name": "txn_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 3, "name": "query", "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "implicit_txn", "type": {"oid": 16}}, {"id": 5, "name": "session_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 6, "name": "start_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 7, "name": "end_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 8, "name": "user_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "app_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "rows_read", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 11, "name": "rows_written", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 12, "name": "priority", "type": {"family": "StringFamily", "oid": 25}}, {"id": 13, "name": "retries", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 14, "name": "last_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "contention", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 16, "name": "problems", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 17, "name": "causes", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 18, "name": "stmt_execution_ids", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 19, "name": "cpu_sql_nanos", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967280, "name": "node_txn_execution_insights", "nextColumnId": 20, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967281 {"table": {"columns": [{"id": 1, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 2, "name": "txn_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 3, "name": "query", "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "implicit_txn", "type": {"oid": 16}}, {"id": 5, "name": "session_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 6, "name": "start_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 7, "name": "end_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 8, "name": "user_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "app_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "rows_read", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 11, "name": "rows_written", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 12, "name": "priority", "type": {"family": "StringFamily", "oid": 25}}, {"id": 13, "name": "retries", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 14, "name": "last_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "contention", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 16, "name": "problems", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 17, "name": "causes", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 18, "name": "stmt_execution_ids", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 19, "name": "cpu_sql_nanos", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967281, "name": "cluster_txn_execution_insights", "nextColumnId": 20, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} -4294967282 {"table": {"columns": [{"id": 1, "name": "session_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 2, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 3, "name": "txn_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 4, "name": "stmt_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "stmt_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 6, "name": "problem", "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "causes", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 8, "name": "query", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "status", "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "start_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 11, "name": "end_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 12, "name": "full_scan", "type": {"oid": 16}}, {"id": 13, "name": "user_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 14, "name": "app_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "database_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 16, "name": "plan_gist", "type": {"family": "StringFamily", "oid": 25}}, {"id": 17, "name": "rows_read", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 18, "name": "rows_written", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 19, "name": "priority", "type": {"family": "StringFamily", "oid": 25}}, {"id": 20, "name": "retries", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 21, "name": "last_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 22, "name": "exec_node_ids", "type": {"arrayContents": {"family": "IntFamily", "oid": 20, "width": 64}, "arrayElemType": "IntFamily", "family": "ArrayFamily", "oid": 1016, "width": 64}}, {"id": 23, "name": "contention", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 24, "name": "contention_events", "nullable": true, "type": {"family": "JsonFamily", "oid": 3802}}, {"id": 25, "name": "index_recommendations", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 26, "name": "implicit_txn", "type": {"oid": 16}}, {"id": 27, "name": "cpu_sql_nanos", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967282, "name": "cluster_execution_insights", "nextColumnId": 28, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} +4294967282 {"table": {"columns": [{"id": 1, "name": "session_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 2, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 3, "name": "txn_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 4, "name": "stmt_id", "type": {"family": "StringFamily", "oid": 25}}, {"id": 5, "name": "stmt_fingerprint_id", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 6, "name": "problem", "type": {"family": "StringFamily", "oid": 25}}, {"id": 7, "name": "causes", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 8, "name": "query", "type": {"family": "StringFamily", "oid": 25}}, {"id": 9, "name": "status", "type": {"family": "StringFamily", "oid": 25}}, {"id": 10, "name": "start_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 11, "name": "end_time", "type": {"family": "TimestampFamily", "oid": 1114}}, {"id": 12, "name": "full_scan", "type": {"oid": 16}}, {"id": 13, "name": "user_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 14, "name": "app_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 15, "name": "database_name", "type": {"family": "StringFamily", "oid": 25}}, {"id": 16, "name": "plan_gist", "type": {"family": "StringFamily", "oid": 25}}, {"id": 17, "name": "rows_read", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 18, "name": "rows_written", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 19, "name": "priority", "type": {"family": "StringFamily", "oid": 25}}, {"id": 20, "name": "retries", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 21, "name": "last_retry_reason", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 22, "name": "exec_node_ids", "type": {"arrayContents": {"family": "IntFamily", "oid": 20, "width": 64}, "arrayElemType": "IntFamily", "family": "ArrayFamily", "oid": 1016, "width": 64}}, {"id": 23, "name": "contention", "nullable": true, "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 24, "name": "contention_events", "nullable": true, "type": {"family": "JsonFamily", "oid": 3802}}, {"id": 25, "name": "index_recommendations", "type": {"arrayContents": {"family": "StringFamily", "oid": 25}, "arrayElemType": "StringFamily", "family": "ArrayFamily", "oid": 1009}}, {"id": 26, "name": "implicit_txn", "type": {"oid": 16}}, {"id": 27, "name": "cpu_sql_nanos", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 28, "name": "error_code", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}], "formatVersion": 3, "id": 4294967282, "name": "cluster_execution_insights", "nextColumnId": 29, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967283 {"table": {"columns": [{"id": 1, "name": "flow_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 2, "name": "node_id", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "stmt", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "since", "type": {"family": "TimestampTZFamily", "oid": 1184}}, {"id": 5, "name": "status", "type": {"family": "StringFamily", "oid": 25}}], "formatVersion": 3, "id": 4294967283, "name": "cluster_distsql_flows", "nextColumnId": 6, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967284 {"table": {"columns": [{"id": 1, "name": "table_id", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 2, "name": "index_id", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 3, "name": "num_contention_events", "type": {"family": "IntFamily", "oid": 20, "width": 64}}, {"id": 4, "name": "cumulative_contention_time", "type": {"family": "IntervalFamily", "intervalDurationField": {}, "oid": 1186}}, {"id": 5, "name": "key", "type": {"family": "BytesFamily", "oid": 17}}, {"id": 6, "name": "txn_id", "type": {"family": "UuidFamily", "oid": 2950}}, {"id": 7, "name": "count", "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967284, "name": "cluster_contention_events", "nextColumnId": 8, "nextConstraintId": 2, "nextIndexId": 2, "nextMutationId": 1, "primaryIndex": {"constraintId": 1, "foreignKey": {}, "geoConfig": {}, "id": 1, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1"}} 4294967285 {"table": {"columns": [{"id": 1, "name": "database_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 2, "name": "schema_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 3, "name": "table_name", "nullable": true, "type": {"family": "StringFamily", "oid": 25}}, {"id": 4, "name": "num_contention_events", "nullable": true, "type": {"family": "IntFamily", "oid": 20, "width": 64}}], "formatVersion": 3, "id": 4294967285, "name": "cluster_contended_tables", "nextColumnId": 5, "nextConstraintId": 1, "nextMutationId": 1, "primaryIndex": {"foreignKey": {}, "geoConfig": {}, "interleave": {}, "partitioning": {}, "sharded": {}}, "privileges": {"ownerProto": "node", "users": [{"privileges": "32", "userProto": "public"}], "version": 2}, "replacementOf": {"time": {}}, "unexposedParentSchemaId": 4294967295, "version": "1", "viewQuery": "SELECT database_name, schema_name, name, sum(num_contention_events) FROM (SELECT DISTINCT database_name, schema_name, name, index_id, num_contention_events FROM crdb_internal.cluster_contention_events JOIN crdb_internal.tables ON crdb_internal.cluster_contention_events.table_id = crdb_internal.tables.table_id) GROUP BY database_name, schema_name, name"}} diff --git a/pkg/sql/sqlstats/insights/BUILD.bazel b/pkg/sql/sqlstats/insights/BUILD.bazel index a659cbe06640..5ed301a99b2b 100644 --- a/pkg/sql/sqlstats/insights/BUILD.bazel +++ b/pkg/sql/sqlstats/insights/BUILD.bazel @@ -53,6 +53,7 @@ go_test( "//pkg/settings/cluster", "//pkg/sql/appstatspb", "//pkg/sql/clusterunique", + "//pkg/sql/pgwire/pgcode", "//pkg/util/stop", "//pkg/util/uint128", "//pkg/util/uuid", diff --git a/pkg/sql/sqlstats/insights/detector.go b/pkg/sql/sqlstats/insights/detector.go index d2d4dcbceff9..6b927c68edb0 100644 --- a/pkg/sql/sqlstats/insights/detector.go +++ b/pkg/sql/sqlstats/insights/detector.go @@ -165,3 +165,7 @@ func (d *latencyThresholdDetector) enabled() bool { func (d *latencyThresholdDetector) isSlow(s *Statement) bool { return d.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&d.st.SV).Seconds() } + +func isFailed(s *Statement) bool { + return s.Status == Statement_Failed +} diff --git a/pkg/sql/sqlstats/insights/detector_test.go b/pkg/sql/sqlstats/insights/detector_test.go index 0c2d26fae5d0..4cbdbe608f90 100644 --- a/pkg/sql/sqlstats/insights/detector_test.go +++ b/pkg/sql/sqlstats/insights/detector_test.go @@ -119,6 +119,66 @@ func TestLatencyQuantileDetector(t *testing.T) { } }) + // Testing the slow and failure detectors at the same time. + t.Run("isSlow and isFailed", func(t *testing.T) { + ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + AnomalyDetectionEnabled.Override(ctx, &st.SV, true) + AnomalyDetectionLatencyThreshold.Override(ctx, &st.SV, 100*time.Millisecond) + + tests := []struct { + name string + seedLatency time.Duration + candidateLatency time.Duration + status Statement_Status + isSlow bool + isFailed bool + }{{ + name: "slow and failed statement", + seedLatency: 100 * time.Millisecond, + candidateLatency: 200 * time.Millisecond, + status: Statement_Failed, + isSlow: true, + isFailed: true, + }, { + name: "slow and non-failed statement", + seedLatency: 100 * time.Millisecond, + candidateLatency: 200 * time.Millisecond, + status: Statement_Completed, + isSlow: true, + isFailed: false, + }, { + name: "fast and non-failed statement", + seedLatency: 100 * time.Millisecond, + candidateLatency: 50 * time.Millisecond, + status: Statement_Completed, + isSlow: false, + isFailed: false, + }, { + name: "fast and failed statement", + seedLatency: 100 * time.Millisecond, + candidateLatency: 50 * time.Millisecond, + status: Statement_Failed, + isSlow: false, + isFailed: true, + }} + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + d := newAnomalyDetector(st, NewMetrics()) + for i := 0; i < 1000; i++ { + d.isSlow(&Statement{LatencyInSeconds: test.seedLatency.Seconds()}) + } + stmt := &Statement{LatencyInSeconds: test.candidateLatency.Seconds(), Status: test.status} + require.Equal(t, test.isSlow, d.isSlow(stmt)) + require.Equal(t, test.isFailed, isFailed(stmt)) + }) + } + }) + + // The statement is fast and has failed. This should hit the failure + // detector but not the slow detector. + t.Run("metrics", func(t *testing.T) { ctx := context.Background() st := cluster.MakeTestingClusterSettings() diff --git a/pkg/sql/sqlstats/insights/insights.proto b/pkg/sql/sqlstats/insights/insights.proto index effb8556a8e3..362525b4f7f7 100644 --- a/pkg/sql/sqlstats/insights/insights.proto +++ b/pkg/sql/sqlstats/insights/insights.proto @@ -120,6 +120,7 @@ message Statement { Problem problem = 21; repeated Cause causes = 22; int64 cpu_sql_nanos = 23 [(gogoproto.customname) = "CPUSQLNanos"]; + string error_code = 24; } diff --git a/pkg/sql/sqlstats/insights/integration/insights_test.go b/pkg/sql/sqlstats/insights/integration/insights_test.go index 900a405e0a3c..efef3cd1d83a 100644 --- a/pkg/sql/sqlstats/insights/integration/insights_test.go +++ b/pkg/sql/sqlstats/insights/integration/insights_test.go @@ -100,7 +100,8 @@ func TestInsightsIntegration(t *testing.T) { "end_time, "+ "full_scan, "+ "implicit_txn, "+ - "cpu_sql_nanos "+ + "cpu_sql_nanos, "+ + "error_code "+ "FROM crdb_internal.node_execution_insights where "+ "query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName) @@ -109,7 +110,8 @@ func TestInsightsIntegration(t *testing.T) { var fullScan bool var implicitTxn bool var cpuSQLNanos int64 - err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos) + var errorCode string + err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos, &errorCode) if err != nil { return err @@ -142,7 +144,8 @@ func TestInsightsIntegration(t *testing.T) { "start_time, "+ "end_time, "+ "implicit_txn, "+ - "cpu_sql_nanos "+ + "cpu_sql_nanos, "+ + "error_code "+ "FROM crdb_internal.cluster_txn_execution_insights WHERE "+ "query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName) @@ -150,7 +153,8 @@ func TestInsightsIntegration(t *testing.T) { var startInsights, endInsights time.Time var implicitTxn bool var cpuSQLNanos int64 - err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos) + var errorCode string + err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos, &errorCode) if err != nil { return err diff --git a/pkg/sql/sqlstats/insights/registry.go b/pkg/sql/sqlstats/insights/registry.go index 9df960f5caed..f14341b7479a 100644 --- a/pkg/sql/sqlstats/insights/registry.go +++ b/pkg/sql/sqlstats/insights/registry.go @@ -97,10 +97,11 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac delete(r.statements, sessionID) defer statements.release() - var slowStatements intsets.Fast + // Mark statements which are detected as slow or have a failed status. + var slowOrFailedStatements intsets.Fast for i, s := range *statements { - if r.detector.isSlow(s) { - slowStatements.Add(i) + if r.detector.isSlow(s) || isFailed(s) { + slowOrFailedStatements.Add(i) } } @@ -112,8 +113,8 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac highContention = transaction.Contention.Seconds() >= LatencyThreshold.Get(&r.causes.st.SV).Seconds() } - if slowStatements.Empty() && !highContention { - // We only record an insight if we have slow statements or high txn contention. + if slowOrFailedStatements.Empty() && !highContention { + // We only record an insight if we have slow or failed statements or high txn contention. return } @@ -127,14 +128,12 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac } for i, s := range *statements { - if slowStatements.Contains(i) { + if slowOrFailedStatements.Contains(i) { switch s.Status { case Statement_Completed: s.Problem = Problem_SlowExecution s.Causes = r.causes.examine(s.Causes, s) case Statement_Failed: - // Note that we'll be building better failure support for 23.1. - // For now, we only mark failed statements that were also slow. s.Problem = Problem_FailedExecution } diff --git a/pkg/sql/sqlstats/insights/registry_test.go b/pkg/sql/sqlstats/insights/registry_test.go index c1ae8dca95e8..d4cd03be0972 100644 --- a/pkg/sql/sqlstats/insights/registry_test.go +++ b/pkg/sql/sqlstats/insights/registry_test.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/appstatspb" "github.com/cockroachdb/cockroach/pkg/sql/clusterunique" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/require" ) @@ -32,6 +33,14 @@ func newStmtWithProblemAndCauses(stmt *Statement, problem Problem, causes []Caus return &newStmt } +// Return a new failed statement with an added errorCode. +func newFailedStmtWithErrorCode(stmt *Statement, errorCode string) *Statement { + newStmt := *stmt + newStmt.Problem = Problem_FailedExecution + newStmt.ErrorCode = errorCode + return &newStmt +} + func TestRegistry(t *testing.T) { ctx := context.Background() @@ -80,6 +89,7 @@ func TestRegistry(t *testing.T) { FingerprintID: appstatspb.StmtFingerprintID(100), LatencyInSeconds: 2, Status: Statement_Failed, + ErrorCode: "22012", } st := cluster.MakeTestingClusterSettings() @@ -93,7 +103,7 @@ func TestRegistry(t *testing.T) { Session: session, Transaction: transaction, Statements: []*Statement{ - newStmtWithProblemAndCauses(statement, Problem_FailedExecution, nil), + newFailedStmtWithErrorCode(statement, pgcode.DivisionByZero.String()), }, }} var actual []*Insight diff --git a/pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go b/pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go index 2117c5c81797..b4bbc6909e64 100644 --- a/pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go +++ b/pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go @@ -202,6 +202,11 @@ func (s *Container) RecordStatement( cpuSQLNanos = value.ExecStats.CPUTime.Nanoseconds() } + var errorCode string + if value.StatementError != nil { + errorCode = pgerror.GetPGCode(value.StatementError).String() + } + s.insights.ObserveStatement(value.SessionID, &insights.Statement{ ID: value.StatementID, FingerprintID: stmtFingerprintID, @@ -222,6 +227,7 @@ func (s *Container) RecordStatement( IndexRecommendations: value.IndexRecommendations, Database: value.Database, CPUSQLNanos: cpuSQLNanos, + ErrorCode: errorCode, }) return stats.ID, nil