From 4bbb694691f94c0cfcecd1fdce891e617e6b0eef Mon Sep 17 00:00:00 2001 From: Daniele Sciascia Date: Thu, 11 Nov 2021 11:13:16 +0100 Subject: [PATCH 1/2] Improve error handling of transaction::commit_or_rollback_by_xid() Handle the case where a call to commit_or_rollback_by_xid() is given a xid for which there is no corresponding streaming applier. We distinguish two cases here: 1) the xid might not exist at all, or the corresponding transaction was already committed or rolled back. The client may just return an error; or 2) all streaming appliers have been closed because the node is currently disconnected. We can't tell if a corresponding transaction exists. The client may want to retry. --- src/transaction.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/transaction.cpp b/src/transaction.cpp index 373452a1..c0dc81e7 100644 --- a/src/transaction.cpp +++ b/src/transaction.cpp @@ -1136,8 +1136,22 @@ int wsrep::transaction::commit_or_rollback_by_xid(const wsrep::xid& xid, if (!sa) { - assert(sa); - client_state_.override_error(wsrep::e_error_during_commit); + enum wsrep::provider::status status; + if (server_state.state() == wsrep::server_state::s_disconnected) + { + // The node has disconnected from the cluster, and has closed + // all streaming appliers. We can't tell if a transaction with + // corresponding xid exists. In any case, we can't do much + // while disconnected, the client should retry. + status = wsrep::provider::error_connection_failed; + } + else + { + // The xid never existed, or it was already committed or + // rolled back. + status = wsrep::provider::error_transaction_missing; + } + client_state_.override_error(wsrep::e_error_during_commit, status); return 1; } From 9db81de8dfd5d6b5da596a90433a95d7c6c5a985 Mon Sep 17 00:00:00 2001 From: Daniele Sciascia Date: Fri, 12 Nov 2021 15:17:24 +0100 Subject: [PATCH 2/2] Assert `server_id.is_undefined() == false` failed in certify_fragment() Handle the case where server_id is undefined in before fragment certification. This may happen if the server is disconnected right before replicating a fragment. --- src/transaction.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/transaction.cpp b/src/transaction.cpp index c0dc81e7..2a015a14 100644 --- a/src/transaction.cpp +++ b/src/transaction.cpp @@ -1531,6 +1531,8 @@ int wsrep::transaction::certify_fragment( flags(flags() | wsrep::provider::flag::implicit_deps); } + client_service_.debug_sync("wsrep_before_fragment_append"); + int ret(0); enum wsrep::client_error error(wsrep::e_success); enum wsrep::provider::status cert_ret(wsrep::provider::success); @@ -1549,14 +1551,21 @@ int wsrep::transaction::certify_fragment( // available to store the fragment. The fragment meta data // is updated after certification. wsrep::id server_id(client_state_.server_state().id()); - assert(server_id.is_undefined() == false); - if (storage_service.start_transaction(ws_handle_) || + if (server_id.is_undefined()) + { + // we can't append a fragment with undefined server_id + // server has disconnected? + ret = 1; + error = wsrep::e_append_fragment_error; + } + + if (!ret && (storage_service.start_transaction(ws_handle_) || storage_service.append_fragment( server_id, id(), flags(), wsrep::const_buffer(data.data(), data.size()), - xid())) + xid()))) { ret = 1; error = wsrep::e_append_fragment_error;