From 92d45df72d5e5716d92bf6b2e8f39faa4e29a1da Mon Sep 17 00:00:00 2001 From: xufei Date: Wed, 6 Jul 2022 20:15:52 +0800 Subject: [PATCH 1/5] refine error message in mpptask Signed-off-by: xufei --- dbms/src/Flash/EstablishCall.cpp | 2 +- dbms/src/Flash/Mpp/MPPTask.cpp | 4 +++- dbms/src/Flash/Mpp/MPPTunnel.cpp | 4 ++++ dbms/src/Flash/Mpp/Utils.cpp | 12 ++++++++++++ dbms/src/Flash/Mpp/Utils.h | 1 + 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/dbms/src/Flash/EstablishCall.cpp b/dbms/src/Flash/EstablishCall.cpp index 89857a2407e..2f8c7c15f56 100644 --- a/dbms/src/Flash/EstablishCall.cpp +++ b/dbms/src/Flash/EstablishCall.cpp @@ -143,7 +143,7 @@ void EstablishCallData::finishTunnelAndResponder() state = FINISH; if (mpp_tunnel) { - mpp_tunnel->consumerFinish("grpc writes failed.", true); //trigger mpp tunnel finish work + mpp_tunnel->consumerFinish(fmt::format("{}: finishTunnelAndResponder called.", mpp_tunnel->id()), true); //trigger mpp tunnel finish work } grpc::Status status(static_cast(GRPC_STATUS_UNKNOWN), "Consumer exits unexpected, grpc writes failed."); responder.Finish(status, this); diff --git a/dbms/src/Flash/Mpp/MPPTask.cpp b/dbms/src/Flash/Mpp/MPPTask.cpp index c2d5e6f49f8..c304d77e90e 100644 --- a/dbms/src/Flash/Mpp/MPPTask.cpp +++ b/dbms/src/Flash/Mpp/MPPTask.cpp @@ -379,7 +379,7 @@ void MPPTask::runImpl() } catch (...) { - err_msg = getCurrentExceptionMessage(true); + err_msg = getCurrentExceptionMessage(true, true); } if (err_msg.empty()) @@ -405,6 +405,8 @@ void MPPTask::runImpl() if (status == RUNNING) { LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); + /// trim the stack trace to avoid too many useless information in log + err_msg = trimStackTrace(err_msg); try { handleError(err_msg); diff --git a/dbms/src/Flash/Mpp/MPPTunnel.cpp b/dbms/src/Flash/Mpp/MPPTunnel.cpp index 13a7eaad95e..16fe4ae42cc 100644 --- a/dbms/src/Flash/Mpp/MPPTunnel.cpp +++ b/dbms/src/Flash/Mpp/MPPTunnel.cpp @@ -220,7 +220,11 @@ void MPPTunnelBase::sendJob(bool need_lock) err_msg = "fatal error in sendJob()"; } if (!err_msg.empty()) + { + /// append tunnel id to error message + err_msg = fmt::format("{} meet error: {}", tunnel_id, err_msg); LOG_ERROR(log, err_msg); + } consumerFinish(err_msg, need_lock); if (is_async) writer->writeDone(grpc::Status::OK); diff --git a/dbms/src/Flash/Mpp/Utils.cpp b/dbms/src/Flash/Mpp/Utils.cpp index 477c478eef7..a9667fa1075 100644 --- a/dbms/src/Flash/Mpp/Utils.cpp +++ b/dbms/src/Flash/Mpp/Utils.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include @@ -27,4 +28,15 @@ mpp::MPPDataPacket getPacketWithError(String reason) return data; } +String trimStackTrace(String message) +{ + auto stack_trace_pos = message.find("Stack trace"); + if (stack_trace_pos != String::npos) + { + message.resize(stack_trace_pos); + Poco::trimRightInPlace(message); + } + return message; +} + } // namespace DB diff --git a/dbms/src/Flash/Mpp/Utils.h b/dbms/src/Flash/Mpp/Utils.h index 67e2dc3f641..117e5f0c734 100644 --- a/dbms/src/Flash/Mpp/Utils.h +++ b/dbms/src/Flash/Mpp/Utils.h @@ -23,5 +23,6 @@ namespace DB { mpp::MPPDataPacket getPacketWithError(String reason); +String trimStackTrace(String message); } // namespace DB From bf17628819195952501d097a6f249aab726255be Mon Sep 17 00:00:00 2001 From: xufei Date: Wed, 6 Jul 2022 21:14:05 +0800 Subject: [PATCH 2/5] refine Signed-off-by: xufei --- dbms/src/Flash/Mpp/MPPTask.cpp | 2 +- dbms/src/Flash/Mpp/Utils.cpp | 3 +-- dbms/src/Flash/Mpp/Utils.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/dbms/src/Flash/Mpp/MPPTask.cpp b/dbms/src/Flash/Mpp/MPPTask.cpp index c304d77e90e..da8f3034abc 100644 --- a/dbms/src/Flash/Mpp/MPPTask.cpp +++ b/dbms/src/Flash/Mpp/MPPTask.cpp @@ -406,7 +406,7 @@ void MPPTask::runImpl() { LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); /// trim the stack trace to avoid too many useless information in log - err_msg = trimStackTrace(err_msg); + trimStackTrace(err_msg); try { handleError(err_msg); diff --git a/dbms/src/Flash/Mpp/Utils.cpp b/dbms/src/Flash/Mpp/Utils.cpp index a9667fa1075..21d89b3cd52 100644 --- a/dbms/src/Flash/Mpp/Utils.cpp +++ b/dbms/src/Flash/Mpp/Utils.cpp @@ -28,7 +28,7 @@ mpp::MPPDataPacket getPacketWithError(String reason) return data; } -String trimStackTrace(String message) +void trimStackTrace(String & message) { auto stack_trace_pos = message.find("Stack trace"); if (stack_trace_pos != String::npos) @@ -36,7 +36,6 @@ String trimStackTrace(String message) message.resize(stack_trace_pos); Poco::trimRightInPlace(message); } - return message; } } // namespace DB diff --git a/dbms/src/Flash/Mpp/Utils.h b/dbms/src/Flash/Mpp/Utils.h index 117e5f0c734..021dc4407d5 100644 --- a/dbms/src/Flash/Mpp/Utils.h +++ b/dbms/src/Flash/Mpp/Utils.h @@ -23,6 +23,6 @@ namespace DB { mpp::MPPDataPacket getPacketWithError(String reason); -String trimStackTrace(String message); +void trimStackTrace(String & message); } // namespace DB From 72deb901ff447dfff92fcdce04592891b206600d Mon Sep 17 00:00:00 2001 From: xufei Date: Thu, 7 Jul 2022 13:12:05 +0800 Subject: [PATCH 3/5] fix ci Signed-off-by: xufei --- dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp index 47ce2ee6ee6..706c17ed036 100644 --- a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp +++ b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp @@ -382,7 +382,7 @@ TEST_F(TestMPPTunnelBase, WriteError) } catch (Exception & e) { - GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, grpc writes failed."); + GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, 0000_0001 meet error: grpc writes failed."); } } @@ -631,7 +631,7 @@ TEST_F(TestMPPTunnelBase, AsyncWriteError) } catch (Exception & e) { - GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, grpc writes failed."); + GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, 0000_0001 meet error: grpc writes failed."); } } From 67e6b7de2d1d306833ef8783c2468b74c49d6825 Mon Sep 17 00:00:00 2001 From: xufei Date: Thu, 7 Jul 2022 13:16:26 +0800 Subject: [PATCH 4/5] fix ci Signed-off-by: xufei --- tests/fullstack-test/mpp/issue_2471.test | 10 +---- tests/fullstack-test/mpp/mpp_fail.test | 50 +++--------------------- 2 files changed, 6 insertions(+), 54 deletions(-) diff --git a/tests/fullstack-test/mpp/issue_2471.test b/tests/fullstack-test/mpp/issue_2471.test index 497ce605893..9966eaadec3 100644 --- a/tests/fullstack-test/mpp/issue_2471.test +++ b/tests/fullstack-test/mpp/issue_2471.test @@ -35,15 +35,7 @@ mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_bro => DBGInvoke __enable_fail_point(exception_in_creating_set_input_stream) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_broadcast_cartesian_join=2; select * from a as t1 left join a as t2 on t1.id = t2.id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered., e.what() = DB::Exception, Stack trace: -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_in_creating_set_input_stream) diff --git a/tests/fullstack-test/mpp/mpp_fail.test b/tests/fullstack-test/mpp/mpp_fail.test index e03c6150be6..d45562531df 100644 --- a/tests/fullstack-test/mpp/mpp_fail.test +++ b/tests/fullstack-test/mpp/mpp_fail.test @@ -71,44 +71,20 @@ ERROR 1105 (HY000) at line 1: DB::Exception: Fail point FailPoints::exception_be ## exception during mpp run non root task => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, Stack trace: -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) ## exception during mpp run root task => DBGInvoke __enable_fail_point(exception_during_mpp_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered., e.what() = DB::Exception, Stack trace: -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_root_task_run) ## exception during mpp write err to tunnel => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_write_err_to_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel, e.what() = DB::Exception, Stack trace: -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_write_err_to_tunnel) @@ -116,14 +92,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_close_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, Stack trace: -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_close_tunnel) @@ -156,16 +125,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText ## ensure build1, build2-probe1, probe2 in the CreatingSets, test the bug where build1 throw exception but not change the build state, thus block the build2-probe1, at last this query hangs. => DBGInvoke __enable_fail_point(exception_mpp_hash_build) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; set @@tidb_broadcast_join_threshold_count=0; set @@tidb_broadcast_join_threshold_size=0; select t1.id from test.t t1 join test.t t2 on t1.id = t2.id and t1.id <2 join (select id from test.t group by id) t3 on t2.id=t3.id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered., e.what() = DB::Exception, Stack trace: -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} -{#LINE} +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_mpp_hash_build) # Clean up. From 074118c400db05197a40e45100802ba77e97c459 Mon Sep 17 00:00:00 2001 From: xufei Date: Thu, 7 Jul 2022 17:26:20 +0800 Subject: [PATCH 5/5] fix ci Signed-off-by: xufei --- tests/fullstack-test/mpp/mpp_fail.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fullstack-test/mpp/mpp_fail.test b/tests/fullstack-test/mpp/mpp_fail.test index d45562531df..0e272c0b621 100644 --- a/tests/fullstack-test/mpp/mpp_fail.test +++ b/tests/fullstack-test/mpp/mpp_fail.test @@ -71,7 +71,7 @@ ERROR 1105 (HY000) at line 1: DB::Exception: Fail point FailPoints::exception_be ## exception during mpp run non root task => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception,, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) ## exception during mpp run root task @@ -92,7 +92,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_close_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception,, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_close_tunnel)