From f02a7908c14724903cf0f803547db3ccb0c1a3eb Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 18 Mar 2021 00:35:43 -0400 Subject: [PATCH] Try to avoid julia becoming unkillable after fatal errors (#40056) - don't smash the alt-stack when already using it - handle jl_critical_error on the original stack, leaving our signal handling thread free to handle more signals (and helping lock corruption detection in some cases) - unblock signals when handling signals: some libc apparently like to block all signals, which can cause mild havoc, since we'd really like the user or bad data to be able to still kill the process (and not just be ignored or cause it to hang) - reset signals to SIG_DFL earlier (so we recurse less) - destroy some state from the Task we co-opted to run the exit handlers, so that it can't accidentally jump back into the running program after we've started tearing down the process, from an untimely ^C (previously ^C might cancel the exit) or a jlbacktrace call. - mark functions as leaf with CFI instead of (potentially) smashing the stack, and add a bit of red-zone if we are recursing (to keep pgcstack sensible) - support safe_restore for the mach catch_exception_raise (while we're trying to generate the backtrace) (cherry picked from commit 107901d09a847eb43c0cde923861b01f969b428d) --- src/gf.c | 2 +- src/julia_internal.h | 32 +++++++- src/signal-handling.c | 41 ++++++++-- src/signals-mach.c | 122 ++++++++++++++++++++---------- src/signals-unix.c | 169 +++++++++++++++++++++++------------------- src/signals-win.c | 9 ++- src/stackwalk.c | 5 +- src/task.c | 12 +-- 8 files changed, 252 insertions(+), 140 deletions(-) diff --git a/src/gf.c b/src/gf.c index 33ec6003c95f9..43d4cf5112b95 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1817,7 +1817,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args, jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n"); jl_ptls_t ptls = jl_get_ptls_states(); ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); - jl_critical_error(0, NULL, ptls->bt_data, &ptls->bt_size); + jl_critical_error(0, NULL); abort(); } // not reached diff --git a/src/julia_internal.h b/src/julia_internal.h index d1c040d8ee71a..0b766967918e2 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -69,6 +69,36 @@ void __tsan_switch_to_fiber(void *fiber, unsigned flags); # define JL_USE_IFUNC 0 #endif +// If we've smashed the stack, (and not just normal NORETURN) +// this will smash stack-unwind too +#ifdef _OS_WINDOWS_ +#if defined(_CPU_X86_64_) + // install the unhandled exception handler at the top of our stack + // to call directly into our personality handler +#define CFI_NORETURN \ + asm volatile ("\t.seh_handler __julia_personality, @except\n\t.text"); +#else +#define CFI_NORETURN +#endif +#else +// wipe out the call-stack unwind capability beyond this function +// (we are noreturn, so it is not a total lie) +#if defined(_CPU_X86_64_) +// per nongnu libunwind: "x86_64 ABI specifies that end of call-chain is marked with a NULL RBP or undefined return address" +// so we do all 3, to be extra certain of it +#define CFI_NORETURN \ + asm volatile ("\t.cfi_undefined rip"); \ + asm volatile ("\t.cfi_undefined rbp"); \ + asm volatile ("\t.cfi_return_column rbp"); +#else + // per nongnu libunwind: "DWARF spec says undefined return address location means end of stack" + // we use whatever happens to be register 1 on this platform for this +#define CFI_NORETURN \ + asm volatile ("\t.cfi_undefined 1"); \ + asm volatile ("\t.cfi_return_column 1"); +#endif +#endif + // If this is detected in a backtrace of segfault, it means the functions // that use this value must be reworked into their async form with cb arg // provided and with JL_UV_LOCK used around the calls @@ -904,7 +934,7 @@ size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT; #endif JL_DLLEXPORT jl_value_t *jl_get_backtrace(void); -void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size); +void jl_critical_error(int sig, bt_context_t *context); JL_DLLEXPORT void jl_raise_debugger(void); int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT; diff --git a/src/signal-handling.c b/src/signal-handling.c index 80dfdb3b2fc21..aa642eeedf2a2 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -231,15 +231,44 @@ void jl_show_sigill(void *_ctx) #endif } -// what to do on a critical error -void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size) +// what to do on a critical error on a thread +void jl_critical_error(int sig, bt_context_t *context) { - // This function is not allowed to reference any TLS variables. - // We need to explicitly pass in the TLS buffer pointer when - // we make `jl_filename` and `jl_lineno` thread local. + + jl_ptls_t ptls = jl_get_ptls_states(); + jl_bt_element_t *bt_data = ptls->bt_data; + size_t *bt_size = &ptls->bt_size; size_t i, n = *bt_size; - if (sig) + if (sig) { + // kill this task, so that we cannot get back to it accidentally (via an untimely ^C or jlbacktrace in jl_exit) + ptls->pgcstack = NULL; + ptls->safe_restore = NULL; + if (ptls->current_task) { + ptls->current_task->eh = NULL; + ptls->current_task->excstack = NULL; + } +#ifndef _OS_WINDOWS_ + sigset_t sset; + sigemptyset(&sset); + // n.b. In `abort()`, Apple's libSystem "helpfully" blocks all signals + // on all threads but SIGABRT. But we also don't know what the thread + // was doing, so unblock all critical signals so that they will crash + // hard, and not just get stuck. + sigaddset(&sset, SIGSEGV); + sigaddset(&sset, SIGBUS); + sigaddset(&sset, SIGILL); + // also unblock fatal signals now, so we won't get back here twice + sigaddset(&sset, SIGTERM); + sigaddset(&sset, SIGABRT); + sigaddset(&sset, SIGQUIT); + // and the original signal is now fatal too, in case it wasn't + // something already listed (?) + if (sig != SIGINT) + sigaddset(&sset, sig); + pthread_sigmask(SIG_UNBLOCK, &sset, NULL); +#endif jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); + } jl_safe_printf("in expression starting at %s:%d\n", jl_filename, jl_lineno); if (context) { // Must avoid extended backtrace frames here unless we're sure bt_data diff --git a/src/signals-mach.c b/src/signals-mach.c index 3737bab1002cd..0d97d3b0dce56 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -84,6 +84,7 @@ extern boolean_t exc_server(mach_msg_header_t *, mach_msg_header_t *); void *mach_segv_listener(void *arg) { (void)arg; + (void)jl_get_ptls_states(); while (1) { int ret = mach_msg_server(exc_server, 2048, segv_port, MACH_MSG_TIMEOUT_NONE); jl_safe_printf("mach_msg_server: %s\n", mach_error_string(ret)); @@ -91,7 +92,8 @@ void *mach_segv_listener(void *arg) } } -static void allocate_segv_handler() + +static void allocate_mach_handler() { // ensure KEYMGR_GCC3_DW2_OBJ_LIST is initialized, as this requires malloc // and thus can deadlock when used without first initializing it. @@ -122,7 +124,7 @@ static void allocate_segv_handler() jl_error("pthread_create failed"); } pthread_attr_destroy(&attr); - for (int16_t tid = 0;tid < jl_n_threads;tid++) { + for (int16_t tid = 0; tid < jl_n_threads; tid++) { attach_exception_port(pthread_mach_thread_np(jl_all_tls_states[tid]->system_id), 0); } } @@ -164,19 +166,31 @@ typedef arm_exception_state64_t host_exception_state_t; static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state, void (*fptr)(void)) { - uint64_t rsp = (uint64_t)ptls2->signal_stack + sig_stack_size; +#ifdef _CPU_X86_64_ + uintptr_t rsp = state->__rsp; +#elif defined(_CPU_AARCH64_) + uintptr_t rsp = state->__sp; +#else +#error "julia: throw-in-context not supported on this platform" +#endif + if (ptls2->signal_stack == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) { + rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment + } + else { + rsp = (uintptr_t)ptls2->signal_stack + sig_stack_size; + } assert(rsp % 16 == 0); - // push (null) $RIP onto the stack - rsp -= sizeof(void*); - *(void**)rsp = NULL; - #ifdef _CPU_X86_64_ + rsp -= sizeof(void*); state->__rsp = rsp; // set stack pointer state->__rip = (uint64_t)fptr; // "call" the function -#else +#elif defined(_CPU_AARCH64_) state->__sp = rsp; state->__pc = (uint64_t)fptr; + state->__lr = 0; +#else +#error "julia: throw-in-context not supported on this platform" #endif } @@ -194,11 +208,22 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio ptls2->sig_exception = exception; } jl_call_in_state(ptls2, &state, &jl_sig_throw); - ret = thread_set_state(thread, THREAD_STATE, - (thread_state_t)&state, count); + ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count); HANDLE_MACH_ERROR("thread_set_state", ret); } +static void segv_handler(int sig, siginfo_t *info, void *context) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + assert(sig == SIGSEGV || sig == SIGBUS); + if (ptls->safe_restore) { // restarting jl_ or jl_unwind_stepn + jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw); + } + else { + sigdie_handler(sig, info, context); + } +} + //exc_server uses dlsym to find symbol JL_DLLEXPORT kern_return_t catch_exception_raise(mach_port_t exception_port, @@ -208,18 +233,16 @@ kern_return_t catch_exception_raise(mach_port_t exception_port, exception_data_t code, mach_msg_type_number_t code_count) { - unsigned int count = THREAD_STATE_COUNT; unsigned int exc_count = HOST_EXCEPTION_STATE_COUNT; host_exception_state_t exc_state; - host_thread_state_t state; -#ifdef LIBOSXUNWIND +#ifdef LLVMLIBUNWIND if (thread == mach_profiler_thread) { return profiler_segv_handler(exception_port, thread, task, exception, code, code_count); } #endif int16_t tid; jl_ptls_t ptls2 = NULL; - for (tid = 0;tid < jl_n_threads;tid++) { + for (tid = 0; tid < jl_n_threads; tid++) { jl_ptls_t _ptls2 = jl_all_tls_states[tid]; if (pthread_mach_thread_np(_ptls2->system_id) == thread) { ptls2 = _ptls2; @@ -288,11 +311,8 @@ kern_return_t catch_exception_raise(mach_port_t exception_port, return KERN_SUCCESS; } else { - kern_return_t ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)&state, &count); - HANDLE_MACH_ERROR("thread_get_state", ret); - jl_critical_error(SIGSEGV, (unw_context_t*)&state, - ptls2->bt_data, &ptls2->bt_size); - return KERN_INVALID_ARGUMENT; + jl_exit_thread0(128 + SIGSEGV, NULL, 0); + return KERN_SUCCESS; } } @@ -307,24 +327,27 @@ static void attach_exception_port(thread_port_t thread, int segv_only) HANDLE_MACH_ERROR("thread_set_exception_ports", ret); } -static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) +static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) { jl_ptls_t ptls2 = jl_all_tls_states[tid]; - mach_port_t tid_port = pthread_mach_thread_np(ptls2->system_id); + mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); - kern_return_t ret = thread_suspend(tid_port); + kern_return_t ret = thread_suspend(thread); HANDLE_MACH_ERROR("thread_suspend", ret); // Do the actual sampling unsigned int count = THREAD_STATE_COUNT; - static unw_context_t state; - memset(&state, 0, sizeof(unw_context_t)); + memset(ctx, 0, sizeof(*ctx)); // Get the state of the suspended thread - ret = thread_get_state(tid_port, THREAD_STATE, (thread_state_t)&state, &count); + ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)ctx, &count); +} - // Initialize the unwind context with the suspend thread's state - *ctx = &state; +static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) +{ + static host_thread_state_t state; + jl_thread_suspend_and_get_state2(tid, &state); + *ctx = (unw_context_t*)&state; } static void jl_thread_resume(int tid, int sig) @@ -366,29 +389,46 @@ static void jl_try_deliver_sigint(void) HANDLE_MACH_ERROR("thread_resume", ret); } -static void jl_exit_thread0(int exitstate) +static void JL_NORETURN jl_exit_thread0_cb(int exitstate) +{ +CFI_NORETURN + jl_critical_error(exitstate - 128, NULL); + jl_exit(exitstate); +} + +static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size) { jl_ptls_t ptls2 = jl_all_tls_states[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); - kern_return_t ret = thread_suspend(thread); - HANDLE_MACH_ERROR("thread_suspend", ret); + + host_thread_state_t state; + jl_thread_suspend_and_get_state2(0, &state); + unw_context_t *uc = (unw_context_t*)&state; // This aborts `sleep` and other syscalls. - ret = thread_abort(thread); + kern_return_t ret = thread_abort(thread); HANDLE_MACH_ERROR("thread_abort", ret); - unsigned int count = THREAD_STATE_COUNT; - host_thread_state_t state; - ret = thread_get_state(thread, THREAD_STATE, - (thread_state_t)&state, &count); + if (bt_data == NULL) { + // Must avoid extended backtrace frames here unless we're sure bt_data + // is properly rooted. + ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, uc, NULL); + } + else { + ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE + memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0])); + } void (*exit_func)(int) = &_exit; if (thread0_exit_count <= 1) { - exit_func = &jl_exit; + exit_func = &jl_exit_thread0_cb; } else if (thread0_exit_count == 2) { exit_func = &exit; } + else { + exit_func = &_exit; + } #ifdef _CPU_X86_64_ // First integer argument. Not portable but good enough =) @@ -399,8 +439,8 @@ static void jl_exit_thread0(int exitstate) #error Fill in first integer argument here #endif jl_call_in_state(ptls2, &state, (void (*)(void))exit_func); - ret = thread_set_state(thread, THREAD_STATE, - (thread_state_t)&state, count); + unsigned int count = THREAD_STATE_COUNT; + ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count); HANDLE_MACH_ERROR("thread_set_state", ret); ret = thread_resume(thread); @@ -498,8 +538,10 @@ void *mach_profile_listener(void *arg) break; } - unw_context_t *uc; - jl_thread_suspend_and_get_state(i, &uc); + host_thread_state_t state; + jl_thread_suspend_and_get_state2(i, &state); + unw_context_t *uc = (unw_context_t*)&state; + if (running) { #ifdef LIBOSXUNWIND /* diff --git a/src/signals-unix.c b/src/signals-unix.c index 57ce2439fcb90..de3b5e13c98df 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -58,7 +58,9 @@ static bt_context_t *jl_to_bt_context(void *sigctx) #endif } + static int thread0_exit_count = 0; +static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size); static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void *_ctx) { @@ -86,8 +88,17 @@ static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void * #endif } +static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr) +{ + // One guard page for signal_stack. + return !((char*)ptr < (char*)ptls->signal_stack - jl_page_size || + (char*)ptr > (char*)ptls->signal_stack + sig_stack_size); +} + // Modify signal context `_ctx` so that `fptr` will execute when the signal // returns. `fptr` will execute on the signal stack, and must not return. +// jl_call_in_ctx is also currently executing on that signal stack, +// so be careful not to smash it static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int sig, void *_ctx) { // Modifying the ucontext should work but there is concern that @@ -105,30 +116,32 @@ static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int sig, void *_c fptr(); return; } - uintptr_t rsp = (uintptr_t)ptls->signal_stack + sig_stack_size; + uintptr_t rsp = jl_get_rsp_from_ctx(_ctx); + if (is_addr_on_sigstack(ptls, (void*)rsp)) { + rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment + } + else { + rsp = (uintptr_t)ptls->signal_stack + sig_stack_size; + } assert(rsp % 16 == 0); #if defined(_OS_LINUX_) && defined(_CPU_X86_64_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); - *(void**)rsp = NULL; ctx->uc_mcontext.gregs[REG_RSP] = rsp; ctx->uc_mcontext.gregs[REG_RIP] = (uintptr_t)fptr; #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); - *(void**)rsp = NULL; ctx->uc_mcontext.mc_rsp = rsp; ctx->uc_mcontext.mc_rip = (uintptr_t)fptr; #elif defined(_OS_LINUX_) && defined(_CPU_X86_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); - *(void**)rsp = NULL; ctx->uc_mcontext.gregs[REG_ESP] = rsp; ctx->uc_mcontext.gregs[REG_EIP] = (uintptr_t)fptr; #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); - *(void**)rsp = NULL; ctx->uc_mcontext.mc_esp = rsp; ctx->uc_mcontext.mc_eip = (uintptr_t)fptr; #elif defined(_OS_LINUX_) && defined(_CPU_AARCH64_) @@ -162,14 +175,14 @@ static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int sig, void *_c // `catch_exception_raise`. It works fine when a signal is received // due to `kill`/`raise` though. ucontext64_t *ctx = (ucontext64_t*)_ctx; - rsp -= sizeof(void*); - *(void**)rsp = NULL; #if defined(_CPU_X86_64_) + rsp -= sizeof(void*); ctx->uc_mcontext64->__ss.__rsp = rsp; ctx->uc_mcontext64->__ss.__rip = (uintptr_t)fptr; #else ctx->uc_mcontext64->__ss.__sp = rsp; ctx->uc_mcontext64->__ss.__pc = (uintptr_t)fptr; + ctx->uc_mcontext64->__ss.__lr = 0; #endif #else #warning "julia: throw-in-context not supported on this platform" @@ -206,16 +219,11 @@ static int is_addr_on_stack(jl_ptls_t ptls, void *addr) static void sigdie_handler(int sig, siginfo_t *info, void *context) { - jl_ptls_t ptls = jl_get_ptls_states(); - sigset_t sset; + signal(sig, SIG_DFL); uv_tty_reset_mode(); if (sig == SIGILL) jl_show_sigill(context); - jl_critical_error(sig, jl_to_bt_context(context), - ptls->bt_data, &ptls->bt_size); - sigfillset(&sset); - sigprocmask(SIG_UNBLOCK, &sset, NULL); - signal(sig, SIG_DFL); + jl_critical_error(sig, jl_to_bt_context(context)); if (sig != SIGSEGV && sig != SIGBUS && sig != SIGILL) { @@ -228,12 +236,6 @@ static void sigdie_handler(int sig, siginfo_t *info, void *context) #include "signals-mach.c" #else -static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr) -{ - // One guard page for signal_stack. - return !((char*)ptr < (char*)ptls->signal_stack - jl_page_size || - (char*)ptr > (char*)ptls->signal_stack + sig_stack_size); -} static int jl_is_on_sigstack(jl_ptls_t ptls, void *ptr, void *context) { @@ -245,7 +247,6 @@ static void segv_handler(int sig, siginfo_t *info, void *context) { jl_ptls_t ptls = jl_get_ptls_states(); assert(sig == SIGSEGV || sig == SIGBUS); - if (jl_addr_is_safepoint((uintptr_t)info->si_addr)) { jl_set_gc_and_wait(); // Do not raise sigint on worker thread @@ -284,22 +285,6 @@ static void segv_handler(int sig, siginfo_t *info, void *context) } } -static void allocate_segv_handler(void) -{ - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - sigemptyset(&act.sa_mask); - act.sa_sigaction = segv_handler; - act.sa_flags = SA_ONSTACK | SA_SIGINFO; - if (sigaction(SIGSEGV, &act, NULL) < 0) { - jl_errorf("fatal error: sigaction: %s", strerror(errno)); - } - // On AArch64, stack overflow triggers a SIGBUS - if (sigaction(SIGBUS, &act, NULL) < 0) { - jl_errorf("fatal error: sigaction: %s", strerror(errno)); - } -} - #if !defined(JL_DISABLE_LIBUNWIND) static unw_context_t *volatile signal_context; static pthread_mutex_t in_signal_lock; @@ -319,9 +304,8 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) static void jl_thread_resume(int tid, int sig) { - (void)sig; jl_ptls_t ptls2 = jl_all_tls_states[tid]; - jl_atomic_store_release(&ptls2->signal_request, 1); + jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1); pthread_cond_broadcast(&exit_signal_cond); pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge assert(jl_atomic_load_acquire(&ptls2->signal_request) == 0); @@ -344,12 +328,14 @@ static void jl_try_deliver_sigint(void) // Write only by signal handling thread, read only by main thread // no sync necessary. static int thread0_exit_state = 0; -static void jl_exit_thread0_cb(void) +static void JL_NORETURN jl_exit_thread0_cb(void) { +CFI_NORETURN // This can get stuck if it happens at an unfortunate spot // (unavoidable due to its async nature). // Try harder to exit each time if we get multiple exit requests. if (thread0_exit_count <= 1) { + jl_critical_error(thread0_exit_state - 128, NULL); jl_exit(thread0_exit_state); } else if (thread0_exit_count == 2) { @@ -360,12 +346,23 @@ static void jl_exit_thread0_cb(void) } } -static void jl_exit_thread0(int state) +static void jl_exit_thread0(int state, jl_bt_element_t *bt_data, size_t bt_size) { jl_ptls_t ptls2 = jl_all_tls_states[0]; - thread0_exit_state = state; - jl_atomic_store_release(&ptls2->signal_request, 3); - pthread_kill(ptls2->system_id, SIGUSR2); + if (thread0_exit_count <= 1) { + unw_context_t *signal_context; + jl_thread_suspend_and_get_state(0, &signal_context); + thread0_exit_state = state; + ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE + memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0])); + jl_thread_resume(0, -1); + } + else { + thread0_exit_state = state; + jl_atomic_store_release(&ptls2->signal_request, 3); + // This also makes sure `sleep` is aborted. + pthread_kill(ptls2->system_id, SIGUSR2); + } } // request: @@ -387,12 +384,10 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx) pthread_cond_broadcast(&signal_caught_cond); pthread_cond_wait(&exit_signal_cond, &in_signal_lock); request = jl_atomic_exchange(&ptls->signal_request, 0); - assert(request == 1); - (void)request; + assert(request == 1 || request == 3); pthread_cond_broadcast(&signal_caught_cond); pthread_mutex_unlock(&in_signal_lock); } - else #endif if (request == 2) { int force = jl_check_force_sigint(); @@ -483,43 +478,42 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) #endif #endif // HAVE_MACH -static void *alloc_sigstack(size_t size) +static void allocate_segv_handler(void) { - size_t pagesz = jl_getpagesize(); - // Add one guard page to catch stack overflow in the signal handler - size = LLT_ALIGN(size, pagesz) + pagesz; - void *stackbuff = mmap(0, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (stackbuff == MAP_FAILED) - jl_errorf("fatal error allocating signal stack: mmap: %s", - strerror(errno)); - mprotect(stackbuff, pagesz, PROT_NONE); - return (void*)((char*)stackbuff + pagesz); + struct sigaction act; + memset(&act, 0, sizeof(struct sigaction)); + sigemptyset(&act.sa_mask); + act.sa_sigaction = segv_handler; + act.sa_flags = SA_ONSTACK | SA_SIGINFO; + if (sigaction(SIGSEGV, &act, NULL) < 0) { + jl_errorf("fatal error: sigaction: %s", strerror(errno)); + } + // On AArch64, stack overflow triggers a SIGBUS + if (sigaction(SIGBUS, &act, NULL) < 0) { + jl_errorf("fatal error: sigaction: %s", strerror(errno)); + } +} + +static void *alloc_sigstack(size_t *ssize) +{ + void *stk = jl_malloc_stack(ssize, NULL); + if (stk == MAP_FAILED) + jl_errorf("fatal error allocating signal stack: mmap: %s", strerror(errno)); + return stk; } void jl_install_thread_signal_handler(jl_ptls_t ptls) { - void *signal_stack = alloc_sigstack(sig_stack_size); + size_t ssize = sig_stack_size; + void *signal_stack = alloc_sigstack(&ssize); + ptls->signal_stack = signal_stack; stack_t ss; ss.ss_flags = 0; - ss.ss_size = sig_stack_size - 16; + ss.ss_size = ssize - 16; ss.ss_sp = signal_stack; if (sigaltstack(&ss, NULL) < 0) { jl_errorf("fatal error: sigaltstack: %s", strerror(errno)); } - -#if !defined(HAVE_MACH) - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - sigemptyset(&act.sa_mask); - act.sa_sigaction = usr2_handler; - act.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART; - if (sigaction(SIGUSR2, &act, NULL) < 0) { - jl_errorf("fatal error: sigaction: %s", strerror(errno)); - } -#endif - - ptls->signal_stack = signal_stack; } static void jl_sigsetset(sigset_t *sset) @@ -737,10 +731,16 @@ static void *signal_listener(void *arg) // this part is async with the running of the rest of the program // and must be thread-safe, but not necessarily signal-handler safe if (critical) { - jl_critical_error(sig, NULL, bt_data, &bt_size); if (doexit) { thread0_exit_count++; - jl_exit_thread0(128 + sig); + jl_exit_thread0(128 + sig, bt_data, bt_size); + } + else { + jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); + size_t i; + for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { + jl_print_bt_entry_codeloc(bt_data + i); + } } } } @@ -787,7 +787,7 @@ void jl_install_default_signal_handlers(void) memset(&actf, 0, sizeof(struct sigaction)); sigemptyset(&actf.sa_mask); actf.sa_sigaction = fpe_handler; - actf.sa_flags = SA_SIGINFO; + actf.sa_flags = SA_ONSTACK | SA_SIGINFO; if (sigaction(SIGFPE, &actf, NULL) < 0) { jl_errorf("fatal error: sigaction: %s", strerror(errno)); } @@ -806,13 +806,26 @@ void jl_install_default_signal_handlers(void) jl_error("fatal error: Couldn't set SIGTRAP"); } +#if defined(HAVE_MACH) + allocate_mach_handler(); +#else + struct sigaction act; + memset(&act, 0, sizeof(struct sigaction)); + sigemptyset(&act.sa_mask); + act.sa_sigaction = usr2_handler; + act.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART; + if (sigaction(SIGUSR2, &act, NULL) < 0) { + jl_errorf("fatal error: sigaction: %s", strerror(errno)); + } +#endif + allocate_segv_handler(); struct sigaction act_die; memset(&act_die, 0, sizeof(struct sigaction)); sigemptyset(&act_die.sa_mask); act_die.sa_sigaction = sigdie_handler; - act_die.sa_flags = SA_SIGINFO; + act_die.sa_flags = SA_SIGINFO | SA_RESETHAND; if (sigaction(SIGILL, &act_die, NULL) < 0) { jl_errorf("fatal error: sigaction: %s", strerror(errno)); } @@ -823,7 +836,7 @@ void jl_install_default_signal_handlers(void) jl_errorf("fatal error: sigaction: %s", strerror(errno)); } // need to ensure the following signals are not SIG_IGN, even though they will be blocked - act_die.sa_flags = SA_SIGINFO | SA_RESTART; + act_die.sa_flags = SA_SIGINFO | SA_RESTART | SA_RESETHAND; #if defined(HAVE_ITIMER) if (sigaction(SIGPROF, &act_die, NULL) < 0) { jl_errorf("fatal error: sigaction: %s", strerror(errno)); diff --git a/src/signals-win.c b/src/signals-win.c index c871c59aa1316..ace5a178d483a 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -92,7 +92,7 @@ void __cdecl crt_sig_handler(int sig, int num) RtlCaptureContext(&Context); if (sig == SIGILL) jl_show_sigill(&Context); - jl_critical_error(sig, &Context, ptls->bt_data, &ptls->bt_size); + jl_critical_error(sig, &Context); raise(sig); } } @@ -309,8 +309,7 @@ LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo) jl_safe_printf(" at 0x%Ix -- ", (size_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); jl_print_native_codeloc((uintptr_t)ExceptionInfo->ExceptionRecord->ExceptionAddress); - jl_critical_error(0, ExceptionInfo->ContextRecord, - ptls->bt_data, &ptls->bt_size); + jl_critical_error(0, ExceptionInfo->ContextRecord); static int recursion = 0; if (recursion++) exit(1); @@ -384,10 +383,12 @@ JL_DLLEXPORT int jl_profile_start_timer(void) { if (hBtThread == NULL) { - if (MMSYSERR_NOERROR != timeGetDevCaps(&timecaps, sizeof(timecaps))) { + TIMECAPS _timecaps; + if (MMSYSERR_NOERROR != timeGetDevCaps(&_timecaps, sizeof(_timecaps))) { fputs("failed to get timer resolution", stderr); return -2; } + timecaps = _timecaps; hBtThread = CreateThread( NULL, // default security attributes diff --git a/src/stackwalk.c b/src/stackwalk.c index 9150d48b29765..52d95e8fe8de7 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -698,7 +698,10 @@ JL_DLLEXPORT void jl_gdblookup(void* ip) // Print backtrace for current exception in catch block JL_DLLEXPORT void jlbacktrace(void) JL_NOTSAFEPOINT { - jl_excstack_t *s = jl_get_ptls_states()->current_task->excstack; + jl_ptls_t ptls = jl_get_ptls_states(); + if (ptls->current_task == NULL) + return; + jl_excstack_t *s = ptls->current_task->excstack; if (!s) return; size_t bt_size = jl_excstack_bt_size(s, s->top); diff --git a/src/task.c b/src/task.c index 4d94d90fc62b3..d65e7412bb7ab 100644 --- a/src/task.c +++ b/src/task.c @@ -647,8 +647,9 @@ JL_DLLEXPORT void jl_rethrow(void) // Special case throw for errors detected inside signal handlers. This is not // (cannot be) called directly in the signal handler itself, but is returned to // after the signal handler exits. -JL_DLLEXPORT void jl_sig_throw(void) +JL_DLLEXPORT void JL_NORETURN jl_sig_throw(void) { +CFI_NORETURN jl_ptls_t ptls = jl_get_ptls_states(); jl_value_t *e = ptls->sig_exception; ptls->sig_exception = NULL; @@ -800,14 +801,7 @@ void jl_init_tasks(void) JL_GC_DISABLED STATIC_OR_JS void NOINLINE JL_NORETURN start_task(void) { -#ifdef _OS_WINDOWS_ -#if defined(_CPU_X86_64_) - // install the unhandled exception hanlder at the top of our stack - // to call directly into our personality handler - asm volatile ("\t.seh_handler __julia_personality, @except\n\t.text"); -#endif -#endif - +CFI_NORETURN // this runs the first time we switch to a task sanitizer_finish_switch_fiber(); jl_ptls_t ptls = jl_get_ptls_states();