Skip to content

Commit

Permalink
sql: record error code for failed executions to execution insights
Browse files Browse the repository at this point in the history
tables

Part of: cockroachdb#87785.

Previously, the error codes for failed executions were not written to
the execution insights tables. This commit adds an `error_code` column
to the `crdb_internal.[cluster|node]_execution_insights` virtual tables
to keep track of the error code for a failed stmt execution. This commit
also adds a also adds a `last_error_code` column to to the
`crdb_internal.[cluster|node_txn_execution_insights` virtual tables to
keep track of the error code for the last failed statement for that
transaction.

Release note (sql change): Adds `error_code` column to the
`crdb_internal.[cluster|node]_execution_insights` virtual tables, which
contains the error code for a failed execution. Adds `last_error_code`
column to the `crdb_internal.[cluster|node]_txn_execution_insights`
virtual tables, which is the error code of the last failed statement in
that transaction.
  • Loading branch information
gtr committed Feb 13, 2023
1 parent 5f1a3d7 commit f390703
Show file tree
Hide file tree
Showing 9 changed files with 182 additions and 18 deletions.
18 changes: 14 additions & 4 deletions pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -310,15 +310,25 @@ SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0
----
trace_id parent_span_id span_id goroutine_id finished start_time duration operation

query TTTBTTTTTIITITTTTTI colnames
query TTTBTTTTTIITITTTTTTTTTTTTIT colnames
SELECT * FROM crdb_internal.cluster_execution_insights WHERE query = ''
----
session_id txn_id txn_fingerprint_id stmt_id stmt_fingerprint_id problem causes query status start_time end_time full_scan user_name app_name database_name plan_gist rows_read rows_written priority retries last_retry_reason exec_node_ids contention index_recommendations implicit_txn cpu_sql_nanos error_code

query TTTBTTTTTIITITTTTTTTTTTTTIT colnames
SELECT * FROM crdb_internal.node_execution_insights WHERE query = ''
----
session_id txn_id txn_fingerprint_id stmt_id stmt_fingerprint_id problem causes query status start_time end_time full_scan user_name app_name database_name plan_gist rows_read rows_written priority retries last_retry_reason exec_node_ids contention index_recommendations implicit_txn cpu_sql_nanos error_code

query TTTBTTTTTIITITTTTTIT colnames
SELECT * FROM crdb_internal.cluster_txn_execution_insights WHERE query = ''
----
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code

query TTTBTTTTTIITITTTTTI colnames
query TTTBTTTTTIITITTTTTIT colnames
SELECT * FROM crdb_internal.node_txn_execution_insights WHERE query = ''
----
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code

query ITTI
SELECT range_id, start_pretty, end_pretty, lease_holder FROM crdb_internal.ranges
Expand Down
15 changes: 13 additions & 2 deletions pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -7062,7 +7062,8 @@ CREATE TABLE crdb_internal.%s (
problems STRING[] NOT NULL,
causes STRING[] NOT NULL,
stmt_execution_ids STRING[] NOT NULL,
cpu_sql_nanos INT8
cpu_sql_nanos INT8,
last_error_code STRING
)`

var crdbInternalClusterTxnExecutionInsightsTable = virtualSchemaTable{
Expand Down Expand Up @@ -7171,6 +7172,13 @@ func populateTxnExecutionInsights(
}
}

errorCode := tree.DNull
for _, stmt := range insight.Statements {
if stmt.ErrorCode != "" {
errorCode = tree.NewDString(stmt.ErrorCode)
}
}

err = errors.CombineErrors(err, addRow(
tree.NewDUuid(tree.DUuid{UUID: insight.Transaction.ID}),
tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Transaction.FingerprintID)))),
Expand All @@ -7191,6 +7199,7 @@ func populateTxnExecutionInsights(
causes,
stmtIDs,
tree.NewDInt(tree.DInt(insight.Transaction.CPUSQLNanos)),
errorCode,
))

if err != nil {
Expand Down Expand Up @@ -7229,7 +7238,8 @@ CREATE TABLE crdb_internal.%s (
contention INTERVAL,
index_recommendations STRING[] NOT NULL,
implicit_txn BOOL NOT NULL,
cpu_sql_nanos INT8
cpu_sql_nanos INT8,
error_code STRING
)`

var crdbInternalClusterExecutionInsightsTable = virtualSchemaTable{
Expand Down Expand Up @@ -7348,6 +7358,7 @@ func populateStmtInsights(
indexRecommendations,
tree.MakeDBool(tree.DBool(insight.Transaction.ImplicitTxn)),
tree.NewDInt(tree.DInt(s.CPUSQLNanos)),
tree.NewDString(s.ErrorCode),
))
}
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal
Original file line number Diff line number Diff line change
Expand Up @@ -461,16 +461,16 @@ SELECT * FROM crdb_internal.ranges_no_leases WHERE range_id < 0
----
range_id start_key start_pretty end_key end_pretty replicas replica_localities voting_replicas non_voting_replicas learner_replicas split_enforced_until

query TTTBTTTTTIITITTTTTI colnames
query TTTBTTTTTIITITTTTTIT colnames
SELECT * FROM crdb_internal.cluster_txn_execution_insights WHERE query = ''
----
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code


query TTTBTTTTTIITITTTTTI colnames
query TTTBTTTTTIITITTTTTIT colnames
SELECT * FROM crdb_internal.node_txn_execution_insights WHERE query = ''
----
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code


statement ok
Expand Down
8 changes: 4 additions & 4 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pkg/sql/sqlstats/insights/insights.proto
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ message Transaction {
repeated bytes stmt_execution_ids = 16 [ (gogoproto.customname) = "StmtExecutionIDs",
(gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/sql/clusterunique.ID"];
int64 cpu_sql_nanos = 17 [(gogoproto.customname) = "CPUSQLNanos"];
// The error code of the last failed statement in the transaction.
string last_error_code = 18;
}

message Statement {
Expand Down Expand Up @@ -119,6 +121,7 @@ message Statement {
Problem problem = 21;
repeated Cause causes = 22;
int64 cpu_sql_nanos = 23 [(gogoproto.customname) = "CPUSQLNanos"];
string error_code = 24;
}


Expand Down
137 changes: 133 additions & 4 deletions pkg/sql/sqlstats/insights/integration/insights_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ func TestInsightsIntegration(t *testing.T) {
"end_time, "+
"full_scan, "+
"implicit_txn, "+
"cpu_sql_nanos "+
"cpu_sql_nanos, "+
"COALESCE(error_code, '') error_code "+
"FROM crdb_internal.node_execution_insights where "+
"query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName)

Expand All @@ -109,7 +110,8 @@ func TestInsightsIntegration(t *testing.T) {
var fullScan bool
var implicitTxn bool
var cpuSQLNanos int64
err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos)
var errorCode string
err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos, &errorCode)

if err != nil {
return err
Expand All @@ -119,6 +121,10 @@ func TestInsightsIntegration(t *testing.T) {
return fmt.Errorf("expected 'Completed', but was %s", status)
}

if errorCode != "" {
return fmt.Errorf("expected error code to be '' but was %s", errorCode)
}

delayFromTable := endInsights.Sub(startInsights).Seconds()
if delayFromTable < queryDelayInSeconds {
return fmt.Errorf("expected at least %f, but was %f", delayFromTable, queryDelayInSeconds)
Expand All @@ -142,20 +148,26 @@ func TestInsightsIntegration(t *testing.T) {
"start_time, "+
"end_time, "+
"implicit_txn, "+
"cpu_sql_nanos "+
"cpu_sql_nanos, "+
"COALESCE(last_error_code, '') last_error_code "+
"FROM crdb_internal.cluster_txn_execution_insights WHERE "+
"query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName)

var query string
var startInsights, endInsights time.Time
var implicitTxn bool
var cpuSQLNanos int64
err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos)
var lastErrorCode string
err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos, &lastErrorCode)

if err != nil {
return err
}

if lastErrorCode != "" {
return fmt.Errorf("expected last error code to be '' but was %s", lastErrorCode)
}

if !implicitTxn {
return fmt.Errorf("expected implictTxn to be true")
}
Expand All @@ -175,6 +187,123 @@ func TestInsightsIntegration(t *testing.T) {
}, 1*time.Second)
}

func TestFailedInsights(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

const appName = "TestFailedInsights"

// Start the cluster. (One node is sufficient; the outliers system is currently in-memory only.)
ctx := context.Background()
settings := cluster.MakeTestingClusterSettings()
args := base.TestClusterArgs{ServerArgs: base.TestServerArgs{Settings: settings}}
tc := testcluster.StartTestCluster(t, 1, args)
defer tc.Stopper().Stop(ctx)
conn := tc.ServerConn(0)

// Enable detection by setting a latencyThreshold > 0.
latencyThreshold := 20 * time.Millisecond
insights.LatencyThreshold.Override(ctx, &settings.SV, latencyThreshold)

_, err := conn.ExecContext(ctx, "SET SESSION application_name=$1", appName)
require.NoError(t, err)

// See no recorded insights.
var count int
var queryText string
row := conn.QueryRowContext(ctx, "SELECT count(*), coalesce(string_agg(query, ';'),'') "+
"FROM crdb_internal.cluster_execution_insights where app_name = $1 ", appName)
err = row.Scan(&count, &queryText)
require.NoError(t, err)
require.Equal(t, 0, count, "expect:0, actual:%d, queries:%s", count, queryText)

// Run a statement that will fail.
failedQuery := "CREATE TABLE crdb_internal.example (abc INT8)"
_, err = conn.ExecContext(ctx, failedQuery)

// Eventually see one recorded insight.
testutils.SucceedsWithin(t, func() error {
row = conn.QueryRowContext(ctx, "SELECT count(*), coalesce(string_agg(query, ';'),'') "+
"FROM crdb_internal.cluster_execution_insights where app_name = $1 ", appName)
if err = row.Scan(&count, &queryText); err != nil {
return err
}
if count != 1 {
return fmt.Errorf("expected 1, but was %d, queryText:%s", count, queryText)
}
return nil
}, 1*time.Second)

// Verify the table content is valid on the crdb_internal.node_execution_insights table.
testutils.SucceedsWithin(t, func() error {
row = conn.QueryRowContext(ctx, "SELECT "+
"query, "+
"status, "+
"start_time, "+
"end_time, "+
"full_scan, "+
"implicit_txn, "+
"cpu_sql_nanos, "+
"COALESCE(error_code, '') error_code "+
"FROM crdb_internal.node_execution_insights where "+
"query = $1 AND app_name = $2 ", failedQuery, appName)

var query, status string
var startInsights, endInsights time.Time
var fullScan bool
var implicitTxn bool
var cpuSQLNanos int64
var errorCode string
err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos, &errorCode)

if err != nil {
return err
}

if status != "Failed" {
return fmt.Errorf("expected 'Failed', but was %s", status)
}

// The error code for the failed statement should be '42501', insufficient privilege.
if errorCode != "42501" {
return fmt.Errorf("expected error code to be '42501' but was %s", errorCode)
}

return nil
}, 1*time.Second)

// Verify the table content is valid on the crdb_internal.node_txn_execution_insights table.
testutils.SucceedsWithin(t, func() error {
row = conn.QueryRowContext(ctx, "SELECT "+
"query, "+
"start_time, "+
"end_time, "+
"implicit_txn, "+
"cpu_sql_nanos, "+
"COALESCE(last_error_code, '') last_error_code "+
"FROM crdb_internal.node_txn_execution_insights where "+
"query = $1 AND app_name = $2 ", failedQuery, appName)

var query string
var startInsights, endInsights time.Time
var implicitTxn bool
var cpuSQLNanos int64
var lastErrorCode string
err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos, &lastErrorCode)

if err != nil {
return err
}

// The error code for the failed statement should be '42501', insufficient privilege.
if lastErrorCode != "42501" {
return fmt.Errorf("expected error code to be '42501' but was %s", lastErrorCode)
}

return nil
}, 1*time.Second)
}

func TestInsightsPriorityIntegration(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/sqlstats/insights/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,15 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
insight.Transaction.Causes = addCause(insight.Transaction.Causes, Cause_HighContention)
}

var lastErrorCode string
for i, s := range *statements {
if slowOrFailedStatements.Contains(i) {
switch s.Status {
case Statement_Completed:
s.Problem = Problem_SlowExecution
s.Causes = r.causes.examine(s.Causes, s)
case Statement_Failed:
lastErrorCode = s.ErrorCode
s.Problem = Problem_FailedExecution
}

Expand All @@ -148,6 +150,7 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
insight.Statements = append(insight.Statements, s)
}

insight.Transaction.LastErrorCode = lastErrorCode
r.sink.AddInsight(insight)
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/sql/sqlstats/insights/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ func TestRegistry(t *testing.T) {
FingerprintID: appstatspb.StmtFingerprintID(100),
LatencyInSeconds: 2,
Status: Statement_Failed,
ErrorCode: "22012",
}

st := cluster.MakeTestingClusterSettings()
Expand All @@ -110,6 +111,7 @@ func TestRegistry(t *testing.T) {
)

require.Equal(t, expected, actual)
require.Equal(t, transaction.LastErrorCode, statement.ErrorCode)
})

t.Run("disabled", func(t *testing.T) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,11 @@ func (s *Container) RecordStatement(
cpuSQLNanos = value.ExecStats.CPUTime.Nanoseconds()
}

var errorCode string
if value.StatementError != nil {
errorCode = pgerror.GetPGCode(value.StatementError).String()
}

s.insights.ObserveStatement(value.SessionID, &insights.Statement{
ID: value.StatementID,
FingerprintID: stmtFingerprintID,
Expand All @@ -218,6 +223,7 @@ func (s *Container) RecordStatement(
IndexRecommendations: value.IndexRecommendations,
Database: value.Database,
CPUSQLNanos: cpuSQLNanos,
ErrorCode: errorCode,
})

return stats.ID, nil
Expand Down

0 comments on commit f390703

Please sign in to comment.