diff --git a/src/gc.c b/src/gc.c
index dad5768732545..3701df108a0f4 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -3922,7 +3922,6 @@ void jl_gc_init(void)
     JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock");
     JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
     uv_mutex_init(&page_profile_lock);
-    uv_mutex_init(&gc_perm_lock);
     uv_mutex_init(&gc_threads_lock);
     uv_cond_init(&gc_threads_cond);
     uv_sem_init(&gc_sweep_assists_needed, 0);
diff --git a/src/init.c b/src/init.c
index e482f8b77ee9b..37eb08f172be5 100644
--- a/src/init.c
+++ b/src/init.c
@@ -738,6 +738,8 @@ static void init_global_mutexes(void) {
     JL_MUTEX_INIT(&typecache_lock, "typecache_lock");
 }
 
+extern uv_mutex_t array_to_string_print_lock;
+
 JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
 {
     // initialize many things, in no particular order
@@ -747,6 +749,10 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
     // Make sure we finalize the tls callback before starting any threads.
     (void)jl_get_pgcstack();
 
+    // Initialize a few locks...
+    uv_mutex_init(&gc_perm_lock);
+    uv_mutex_init(&array_to_string_print_lock);
+
     // initialize backtraces
     jl_init_profile_lock();
 #ifdef _OS_WINDOWS_
@@ -773,6 +779,7 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
     jl_io_loop = uv_default_loop(); // this loop will internal events (spawning process etc.),
     // best to call this first, since it also initializes libuv
     jl_init_uv();
+    jl_init_threading();
     init_stdio();
     restore_fp_env();
     if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON)
@@ -818,7 +825,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
     jl_init_rand();
     jl_init_runtime_ccall();
     jl_init_tasks();
-    jl_init_threading();
     jl_init_threadinginfra();
     if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON)
         jl_install_default_signal_handlers();
@@ -855,8 +861,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
 
 void jl_init_heartbeat(void);
 
-extern uv_mutex_t array_to_string_print_lock;
-
 static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct)
 {
     JL_TIMING(JULIA_INIT, JULIA_INIT);
@@ -892,8 +896,9 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_
     }
     if (jl_base_module == NULL) {
-        // nthreads > 1 requires code in Base
-        jl_atomic_store_relaxed(&jl_n_threads, 1);
+        const int num_min_mutator_threads = 1; // main thread
+        // nthreads > num_min_mutator_threads requires code in Base
+        jl_atomic_store_relaxed(&jl_n_threads, num_min_mutator_threads);
         jl_n_markthreads = 0;
         jl_n_sweepthreads = 0;
         jl_n_gcthreads = 0;
@@ -904,8 +909,6 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_
     jl_start_gc_threads();
     uv_barrier_wait(&thread_init_done);
 
-    uv_mutex_init(&array_to_string_print_lock);
-
     jl_init_heartbeat();
 
     jl_gc_enable(1);
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 94b1f85112d7d..079661a59b8c3 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -206,6 +206,14 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA
 int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
 void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
 
+jl_task_t *jl_get_random_task(void) JL_NOTSAFEPOINT;
+void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT;
+extern volatile struct _jl_bt_element_t *bt_data_prof;
+extern volatile size_t bt_size_max;
+extern volatile size_t bt_size_cur;
+extern volatile int running;
+extern volatile int profile_all_tasks;
+
 // number of cycles since power-on
 static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
 {
diff --git a/src/partr.c b/src/partr.c
index 33631dc83c05a..03ab1d5ff688e 100644
--- a/src/partr.c
+++ b/src/partr.c
@@ -90,6 +90,23 @@ JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max, uint32_t unbias)
     return cong(max, -(uint64_t)-unbias, &ptls->rngseed);
 }
 
+jl_ptls_t jl_threadfun_preamble(void *arg, uint8_t state)
+{
+    jl_threadarg_t *targ = (jl_threadarg_t*)arg;
+    // initialize this thread (set tid and create heap)
+    jl_ptls_t ptls = jl_init_threadtls(targ->tid);
+    void *stack_lo, *stack_hi;
+    jl_init_stack_limits(0, &stack_lo, &stack_hi);
+    // warning: this changes `jl_current_task`, so be careful not to call that from this function
+    jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi);
+    JL_GC_PROMISE_ROOTED(ct);
+    // wait for all threads
+    jl_gc_state_set(ptls, state, 0);
+    uv_barrier_wait(targ->barrier);
+    free(targ);
+    return ptls;
+}
+
 // initialize the threading infrastructure
 // (called only by the main thread)
 void jl_init_threadinginfra(void)
@@ -123,19 +140,7 @@ void jl_parallel_gc_threadfun(void *arg)
 {
     jl_threadarg_t *targ = (jl_threadarg_t*)arg;
 
-    // initialize this thread (set tid and create heap)
-    jl_ptls_t ptls = jl_init_threadtls(targ->tid);
-    void *stack_lo, *stack_hi;
-    jl_init_stack_limits(0, &stack_lo, &stack_hi);
-    // warning: this changes `jl_current_task`, so be careful not to call that from this function
-    jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi);
-    JL_GC_PROMISE_ROOTED(ct);
-    // wait for all threads
-    jl_gc_state_set(ptls, JL_GC_PARALLEL_COLLECTOR_THREAD, 0);
-    uv_barrier_wait(targ->barrier);
-
-    // free the thread argument here
-    free(targ);
+    jl_ptls_t ptls = jl_threadfun_preamble(targ, JL_GC_PARALLEL_COLLECTOR_THREAD);
 
     while (1) {
         uv_mutex_lock(&gc_threads_lock);
@@ -158,19 +163,8 @@ void jl_concurrent_gc_threadfun(void *arg)
 {
     jl_threadarg_t *targ = (jl_threadarg_t*)arg;
 
-    // initialize this thread (set tid and create heap)
-    jl_ptls_t ptls = jl_init_threadtls(targ->tid);
-    void *stack_lo, *stack_hi;
-    jl_init_stack_limits(0, &stack_lo, &stack_hi);
-    // warning: this changes `jl_current_task`, so be careful not to call that from this function
-    jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi);
-    JL_GC_PROMISE_ROOTED(ct);
-    // wait for all threads
-    jl_gc_state_set(ptls, JL_GC_CONCURRENT_COLLECTOR_THREAD, 0);
-    uv_barrier_wait(targ->barrier);
-
-    // free the thread argument here
-    free(targ);
+    jl_ptls_t ptls = jl_threadfun_preamble(targ, JL_GC_CONCURRENT_COLLECTOR_THREAD);
+    (void)ptls;
 
     while (1) {
         assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_CONCURRENT_COLLECTOR_THREAD);
@@ -184,20 +178,8 @@ void jl_threadfun(void *arg)
 {
     jl_threadarg_t *targ = (jl_threadarg_t*)arg;
 
-    // initialize this thread (set tid, create heap, set up root task)
-    jl_ptls_t ptls = jl_init_threadtls(targ->tid);
-    void *stack_lo, *stack_hi;
-    jl_init_stack_limits(0, &stack_lo, &stack_hi);
-    // warning: this changes `jl_current_task`, so be careful not to call that from this function
-    jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi);
-    JL_GC_PROMISE_ROOTED(ct);
-
-    // wait for all threads
-    jl_gc_state_set(ptls, JL_GC_STATE_SAFE, 0);
-    uv_barrier_wait(targ->barrier);
-
-    // free the thread argument here
-    free(targ);
+    jl_ptls_t ptls = jl_threadfun_preamble(targ, JL_GC_STATE_SAFE);
+    jl_task_t *ct = jl_current_task;
 
     (void)jl_gc_unsafe_enter(ptls);
     jl_finish_task(ct); // noreturn
diff --git a/src/signal-handling.c b/src/signal-handling.c
index c2a344a752547..dfd025da65a5b 100644
--- a/src/signal-handling.c
+++ b/src/signal-handling.c
@@ -18,16 +18,16 @@ extern "C" {
 #include
 
 // Profiler control variables
-// Note: these "static" variables are also used in "signals-*.c"
-static volatile jl_bt_element_t *bt_data_prof = NULL;
-static volatile size_t bt_size_max = 0;
-static volatile size_t bt_size_cur = 0;
+volatile jl_bt_element_t *bt_data_prof = NULL;
+volatile size_t bt_size_max = 0;
+volatile size_t bt_size_cur = 0;
 static volatile uint64_t nsecprof = 0;
-static volatile int running = 0;
-static const uint64_t GIGA = 1000000000ULL;
+volatile int running = 0;
+volatile int profile_all_tasks = 0;
+static const uint64_t GIGA = 1000000000ULL;
 // Timers to take samples at intervals
 JL_DLLEXPORT void jl_profile_stop_timer(void);
-JL_DLLEXPORT int jl_profile_start_timer(void);
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t);
 
 // File-descriptor for safe logging on signal handling
 int jl_sig_fd;
diff --git a/src/signals-mach.c b/src/signals-mach.c
index 6ec8f95570f17..5d6cd0d405a13 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -603,6 +603,85 @@ void jl_unlock_stackwalk(int lockret)
     jl_unlock_profile_mach(1, lockret);
 }
 
+// assumes holding `jl_lock_profile_mach`
+void jl_profile_thread_mach(int tid)
+{
+    // if there is no space left, return early
+    if (jl_profile_is_buffer_full()) {
+        jl_profile_stop_timer();
+        return;
+    }
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_prepare();
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_prepare(); // briefly acquire the dlsym lock
+    host_thread_state_t state;
+    int valid_thread = jl_thread_suspend_and_get_state2(tid, &state);
+    unw_context_t *uc = (unw_context_t*)&state;
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_parent(); // quickly release the dlsym lock
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_parent();
+    if (!valid_thread)
+        return;
+    if (running) {
+#ifdef LLVMLIBUNWIND
+        /*
+         * Unfortunately compact unwind info is incorrectly generated for quite a number of
+         * libraries by quite a large number of compilers. We can fall back to DWARF unwind info
+         * in some cases, but in quite a number of cases (especially libraries not compiled in debug
+         * mode, only the compact unwind info may be available). Even more unfortunately, there is no
+         * way to detect such bogus compact unwind info (other than noticing the resulting segfault).
+         * What we do here is ugly, but necessary until the compact unwind info situation improves.
+         * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
+         * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
+         * entry will always be correct, and the number of cases in which this is an issue is rather small.
+         * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
+         * and during stack unwinding we only ever read memory, but never write it.
+         */
+
+        forceDwarf = 0;
+        unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
+
+        if (forceDwarf == 0) {
+            // Save the backtrace
+            bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == 1) {
+            bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == -1) {
+            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+        }
+
+        forceDwarf = -2;
+#else
+        bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
+#endif
+        jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+        // store threadid but add 1 as 0 is preserved to indicate end of block
+        bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
+
+        // store task id (never null)
+        bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
+
+        // store cpu cycle clock
+        bt_data_prof[bt_size_cur++].uintptr = cycleclock();
+
+        // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+        bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
+
+        // Mark the end of this block with two 0's
+        bt_data_prof[bt_size_cur++].uintptr = 0;
+        bt_data_prof[bt_size_cur++].uintptr = 0;
+    }
+    // We're done! Resume the thread.
+    jl_thread_resume(tid);
+}
+
+void jl_profile_task_unix(size_t nthreads);
+
 void *mach_profile_listener(void *arg)
 {
     (void)arg;
@@ -620,85 +699,18 @@ void *mach_profile_listener(void *arg)
         // sample each thread, round-robin style in reverse order
        // (so that thread zero gets notified last)
         int keymgr_locked = jl_lock_profile_mach(0);
         int nthreads = jl_atomic_load_acquire(&jl_n_threads);
-        int *randperm = profile_get_randperm(nthreads);
-        for (int idx = nthreads; idx-- > 0; ) {
-            // Stop the threads in the random or reverse round-robin order.
-            int i = randperm[idx];
-            // if there is no space left, break early
-            if (jl_profile_is_buffer_full()) {
-                jl_profile_stop_timer();
-                break;
-            }
-
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_prepare();
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_prepare(); // briefly acquire the dlsym lock
-            host_thread_state_t state;
-            int valid_thread = jl_thread_suspend_and_get_state2(i, &state);
-            unw_context_t *uc = (unw_context_t*)&state;
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_parent(); // quickly release the dlsym lock
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_parent();
-            if (!valid_thread)
-                continue;
-            if (running) {
-#ifdef LLVMLIBUNWIND
-                /*
-                 * Unfortunately compact unwind info is incorrectly generated for quite a number of
-                 * libraries by quite a large number of compilers. We can fall back to DWARF unwind info
-                 * in some cases, but in quite a number of cases (especially libraries not compiled in debug
-                 * mode, only the compact unwind info may be available). Even more unfortunately, there is no
-                 * way to detect such bogus compact unwind info (other than noticing the resulting segfault).
-                 * What we do here is ugly, but necessary until the compact unwind info situation improves.
-                 * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
-                 * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
-                 * entry will always be correct, and the number of cases in which this is an issue is rather small.
-                 * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
-                 * and during stack unwinding we only ever read memory, but never write it.
-                 */
-
-                forceDwarf = 0;
-                unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
-
-                if (forceDwarf == 0) {
-                    // Save the backtrace
-                    bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == 1) {
-                    bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == -1) {
-                    jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                }
-
-                forceDwarf = -2;
-#else
-                bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-#endif
-                jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                // store threadid but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
-
-                // store task id (never null)
-                bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
-
-                // store cpu cycle clock
-                bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
-
-                // Mark the end of this block with two 0's
-                bt_data_prof[bt_size_cur++].uintptr = 0;
-                bt_data_prof[bt_size_cur++].uintptr = 0;
+        if (profile_all_tasks) {
+            // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
+            jl_profile_task_unix(nthreads);
+        }
+        else {
+            int *randperm = profile_get_randperm(nthreads);
+            for (int idx = nthreads; idx-- > 0; ) {
+                // Stop the threads in random order.
+                int i = randperm[idx];
+                jl_profile_thread_mach(i);
             }
-            // We're done! Resume the thread.
-            jl_thread_resume(i);
         }
         jl_unlock_profile_mach(0, keymgr_locked);
         if (running) {
@@ -710,7 +722,8 @@ void *mach_profile_listener(void *arg)
     }
 }
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     kern_return_t ret;
     if (!profile_started) {
@@ -740,6 +753,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     timerprof.tv_nsec = nsecprof%GIGA;
 
     running = 1;
+    profile_all_tasks = all_tasks;
     // ensure the alarm is running
     ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
     HANDLE_MACH_ERROR("clock_alarm", ret);
@@ -750,4 +764,5 @@ JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
     running = 0;
+    profile_all_tasks = 0;
 }
diff --git a/src/signals-unix.c b/src/signals-unix.c
index 3ebf7954dccfc..db425a082c72e 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -9,6 +9,10 @@
 #include
 #include
 #include
+
+#include "julia.h"
+#include "julia_internal.h"
+
 #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
 #define MAP_ANONYMOUS MAP_ANON
 #endif
@@ -562,7 +566,7 @@ int timer_graceperiod_elapsed(void)
 static timer_t timerprof;
 static struct itimerspec itsprof;
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     struct sigevent sigprof;
 
@@ -573,8 +577,10 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     sigprof.sigev_value.sival_ptr = &timerprof;
     // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer
     running = 1;
+    profile_all_tasks = all_tasks;
     if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) {
         running = 0;
+        profile_all_tasks = 0;
         return -2;
     }
 
@@ -585,6 +591,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     itsprof.it_value.tv_nsec = nsecprof % GIGA;
     if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) {
         running = 0;
+        profile_all_tasks = 0;
         return -3;
     }
     return 0;
@@ -700,12 +707,97 @@ void trigger_profile_peek(void)
         }
     }
     bt_size_cur = 0; // clear profile buffer
-    if (jl_profile_start_timer() < 0)
+    if (jl_profile_start_timer(0) < 0)
         jl_safe_printf("ERROR: Could not start profile timer\n");
     else
         profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9);
 }
 
+void jl_profile_task_unix(size_t nthreads)
+{
+    if (jl_profile_is_buffer_full()) {
+        // Buffer full: Delete the timer
+        jl_profile_stop_timer();
+        return;
+    }
+
+    jl_task_t *t = jl_get_random_task();
+    assert(t == NULL || jl_is_task(t));
+    if (t == NULL) {
+        return;
+    }
+    int t_state = jl_atomic_load_relaxed(&t->_state);
+    if (t_state == JL_TASK_STATE_DONE) {
+        return;
+    }
+
+    jl_rec_backtrace(t);
+
+    // store threadid but add 1 as 0 is preserved to indicate end of block
+    bt_data_prof[bt_size_cur++].uintptr = 1; // dummy value for now... Is this ever used when outputting the profile?
+
+    // store task id (never null)
+    bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)t;
+
+    // store cpu cycle clock. XXX(Diogo, Nick): why are we recording the cycleclock here?
+    bt_data_prof[bt_size_cur++].uintptr = cycleclock();
+
+    // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+    bt_data_prof[bt_size_cur++].uintptr = 1; // dummy value for now... Is this ever used when outputting the profile?
+
+    // Mark the end of this block with two 0's
+    bt_data_prof[bt_size_cur++].uintptr = 0;
+    bt_data_prof[bt_size_cur++].uintptr = 0;
+}
+
+// assumes holding `jl_lock_stackwalk`
+void jl_profile_thread_unix(int tid, bt_context_t *signal_context)
+{
+    if (jl_profile_is_buffer_full()) {
+        // Buffer full: Delete the timer
+        jl_profile_stop_timer();
+        return;
+    }
+    // notify thread to stop
+    if (!jl_thread_suspend_and_get_state(tid, 1, signal_context))
+        return;
+    // unwinding can fail, so keep track of the current state
+    // and restore from the SEGV handler if anything happens.
+    jl_jmp_buf *old_buf = jl_get_safe_restore();
+    jl_jmp_buf buf;
+
+    jl_set_safe_restore(&buf);
+    if (jl_setjmp(buf, 0)) {
+        jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+    } else {
+        // Get backtrace data
+        bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
+                bt_size_max - bt_size_cur - 1, signal_context, NULL);
+    }
+    jl_set_safe_restore(old_buf);
+
+    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+    // store threadid but add 1 as 0 is preserved to indicate end of block
+    bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1;
+
+    // store task id (never null)
+    bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
+
+    // store cpu cycle clock
+    bt_data_prof[bt_size_cur++].uintptr = cycleclock();
+
+    // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+    bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1;
+
+    // Mark the end of this block with two 0's
+    bt_data_prof[bt_size_cur++].uintptr = 0;
+    bt_data_prof[bt_size_cur++].uintptr = 0;
+
+    // notify thread to resume
+    jl_thread_resume(tid);
+}
+
 static void *signal_listener(void *arg)
 {
     static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1];
@@ -845,76 +937,44 @@ static void *signal_listener(void *arg)
         bt_size = 0;
 #if !defined(JL_DISABLE_LIBUNWIND)
         bt_context_t signal_context;
-        // sample each thread, round-robin style in reverse order
-        // (so that thread zero gets notified last)
-        if (critical || profile) {
+        if (critical) {
             int lockret = jl_lock_stackwalk();
-            int *randperm;
-            if (profile)
-                randperm = profile_get_randperm(nthreads);
-            for (int idx = nthreads; idx-- > 0; ) {
-                // Stop the threads in the random or reverse round-robin order.
-                int i = profile ? randperm[idx] : idx;
+            // sample each thread, round-robin style in reverse order
+            // (so that thread zero gets notified last)
+            for (int i = nthreads; i-- > 0; ) {
                 // notify thread to stop
                 if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
                     continue;
 
                 // do backtrace on thread contexts for critical signals
                 // this part must be signal-handler safe
-                if (critical) {
-                    bt_size += rec_backtrace_ctx(bt_data + bt_size,
-                            JL_MAX_BT_SIZE / nthreads - 1,
-                            &signal_context, NULL);
-                    bt_data[bt_size++].uintptr = 0;
-                }
-
-                // do backtrace for profiler
-                if (profile && running) {
-                    if (jl_profile_is_buffer_full()) {
-                        // Buffer full: Delete the timer
-                        jl_profile_stop_timer();
-                    }
-                    else {
-                        // unwinding can fail, so keep track of the current state
-                        // and restore from the SEGV handler if anything happens.
-                        jl_jmp_buf *old_buf = jl_get_safe_restore();
-                        jl_jmp_buf buf;
-
-                        jl_set_safe_restore(&buf);
-                        if (jl_setjmp(buf, 0)) {
-                            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                        } else {
-                            // Get backtrace data
-                            bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
-                                    bt_size_max - bt_size_cur - 1, &signal_context, NULL);
-                        }
-                        jl_set_safe_restore(old_buf);
-
-                        jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                        // store threadid but add 1 as 0 is preserved to indicate end of block
-                        bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1;
-
-                        // store task id (never null)
-                        bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
-
-                        // store cpu cycle clock
-                        bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                        // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                        bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1;
-
-                        // Mark the end of this block with two 0's
-                        bt_data_prof[bt_size_cur++].uintptr = 0;
-                        bt_data_prof[bt_size_cur++].uintptr = 0;
-                    }
-                }
-
-                // notify thread to resume
+                bt_size += rec_backtrace_ctx(bt_data + bt_size,
+                        JL_MAX_BT_SIZE / nthreads - 1,
+                        &signal_context, NULL);
+                bt_data[bt_size++].uintptr = 0;
                 jl_thread_resume(i);
             }
             jl_unlock_stackwalk(lockret);
         }
+        else if (profile) {
+            if (profile_all_tasks) {
+                // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
+                jl_profile_task_unix(nthreads);
+            }
+            else {
+                int lockret = jl_lock_stackwalk();
+                int *randperm = profile_get_randperm(nthreads);
+                for (int idx = nthreads; idx-- > 0; ) {
+                    // Stop the threads in random order.
+                    int i = randperm[idx];
+                    // do backtrace for profiler
+                    if (profile && running) {
+                        jl_profile_thread_unix(i, &signal_context);
+                    }
+                }
+                jl_unlock_stackwalk(lockret);
+            }
+        }
 #ifndef HAVE_MACH
         if (profile && running) {
             jl_check_profile_autostop();
diff --git a/src/signals-win.c b/src/signals-win.c
index bcb3a1fd246f0..ff0b1269050ff 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -449,7 +449,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
 
 static volatile TIMECAPS timecaps;
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     if (hBtThread == NULL) {
@@ -483,6 +483,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
         if (TIMERR_NOERROR != timeBeginPeriod(timecaps.wPeriodMin))
             timecaps.wPeriodMin = 0;
     }
+    profile_all_tasks = all_tasks;
     running = 1; // set `running` finally
     return 0;
 }
@@ -491,6 +492,7 @@ JL_DLLEXPORT void jl_profile_stop_timer(void)
     if (running && timecaps.wPeriodMin)
         timeEndPeriod(timecaps.wPeriodMin);
     running = 0;
+    profile_all_tasks = 0;
 }
 
 void jl_install_default_signal_handlers(void)
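Both sampling paths above (the Mach listener and the POSIX signal listener) finish every sample with the same trailer: the thread id plus one, the task pointer, the CPU cycle clock, the sleep state plus one, and then two zero words that terminate the block. The sketch below only illustrates how a consumer could walk a flat buffer with that layout; the helper name and the assumption that the raw sample words are available on the Julia side (for example via `Profile.fetch(include_meta = true)`) are illustrative and not part of this patch.

```julia
# Illustrative sketch: walk raw profile words laid out as
#   [frame addresses..., threadid + 1, task id, cycle clock, sleep state + 1, 0, 0]
# and count how many samples landed in each task. Assumes `data` preserves that layout.
function count_samples_per_task(data::Vector{UInt64})
    counts = Dict{UInt64,Int}()     # task-id word => number of samples
    i = 1
    while i < length(data)
        if i > 4 && data[i] == 0 && data[i + 1] == 0   # double zero terminates a sample block
            taskid = data[i - 3]    # task word sits two slots before the sleep-state word
            counts[taskid] = get(counts, taskid, 0) + 1
            i += 2                  # skip the terminator
        else
            i += 1
        end
    end
    return counts
end
```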
diff --git a/src/stackwalk.c b/src/stackwalk.c
index 8a12fb2a28143..664e2ff9b3a48 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -868,21 +868,38 @@ _os_ptr_munge(uintptr_t ptr)
 #endif
 
-extern bt_context_t *jl_to_bt_context(void *sigctx);
+STATIC_INLINE int all_tasks_profile_running(void)
+{
+    return running && profile_all_tasks;
+}
 
-static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT
+void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT
 {
-    jl_task_t *ct = jl_current_task;
-    jl_ptls_t ptls = ct->ptls;
-    ptls->bt_size = 0;
+    jl_task_t *ct = NULL;
+    jl_ptls_t ptls = NULL;
+    int16_t tid = INT16_MAX;
+    if (!all_tasks_profile_running()) {
+        ct = jl_current_task;
+        ptls = ct->ptls;
+        ptls->bt_size = 0;
+        tid = ptls->tid;
+    }
     if (t == ct) {
-        ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
-        return;
+        // Record into the profile buffer
+        if (all_tasks_profile_running()) {
+            bt_size_cur += rec_backtrace((jl_bt_element_t*)bt_data_prof + bt_size_cur,
+                    bt_size_max - bt_size_cur - 1, 0);
+            return;
+        }
+        else {
+            ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
+            return;
+        }
     }
     bt_context_t *context = NULL;
     bt_context_t c;
     int16_t old = -1;
-    while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) {
+    while (!jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid) {
         int lockret = jl_lock_stackwalk();
         // if this task is already running somewhere, we need to stop the thread it is running on and query its state
         if (!jl_thread_suspend_and_get_state(old, 0, &c)) {
@@ -1109,11 +1126,21 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT
 #pragma message("jl_rec_backtrace not defined for unknown task system")
 #endif
     }
-    if (context)
-        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
+    if (context) {
+        // Record into the profile buffer
+        if (all_tasks_profile_running()) {
+            bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
+                    bt_size_max - bt_size_cur - 1, context, NULL);
+        }
+        // Record into the buffer owned by the thread's TLS
+        else {
+            ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE,
+                    context, t->gcstack);
+        }
+    }
     if (old == -1)
         jl_atomic_store_relaxed(&t->tid, old);
-    else if (old != ptls->tid)
+    else if (old != tid)
         jl_thread_resume(old);
 }
diff --git a/src/task.c b/src/task.c
index 86033a81ddf41..c9eb5f14efb99 100644
--- a/src/task.c
+++ b/src/task.c
@@ -1115,6 +1115,46 @@ JL_DLLEXPORT jl_task_t *jl_get_current_task(void)
     return pgcstack == NULL ? NULL : container_of(pgcstack, jl_task_t, gcstack);
 }
 
+extern int gc_first_tid;
+
+// Select a task at random to profile. Racy: `live_tasks` can change at any time.
+jl_task_t *jl_get_random_task(void) JL_NOTSAFEPOINT
+{
+    arraylist_t tasks;
+    arraylist_new(&tasks, 0);
+    size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads...
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
+        jl_ptls_t ptls2 = allstates[i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_task_t *t = ptls2->root_task;
+        if (t->stkbuf != NULL) {
+            arraylist_push(&tasks, t);
+        }
+        small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        for (size_t i = 0; i < n; i++) {
+            jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i);
+            if (t->stkbuf != NULL) {
+                arraylist_push(&tasks, t);
+            }
+        }
+    }
+    size_t n = tasks.len;
+    if (n == 0) {
+        return NULL;
+    }
+    jl_task_t *t = (jl_task_t*)tasks.items[jl_rand() % n];
+    arraylist_free(&tasks);
+    return t;
+}
+
 #ifdef JL_HAVE_ASYNCIFY
 JL_DLLEXPORT jl_ucontext_t *task_ctx_ptr(jl_task_t *t)
diff --git a/src/threading.c b/src/threading.c
index 8f350d41f64b1..245cfb6ce19e3 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -710,6 +710,8 @@ void jl_init_threading(void)
     jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size);
     jl_n_gcthreads = ngcthreads;
     gc_first_tid = nthreads + nthreadsi;
+
+    uv_barrier_init(&thread_init_done, jl_all_tls_states_size);
 }
 
 uv_barrier_t thread_init_done;
@@ -747,16 +749,12 @@ void jl_start_threads(void)
         mask[0] = 0;
     }
 
-    // create threads
-    uv_barrier_init(&thread_init_done, nthreads);
-
+    // Create threads
     // GC/System threads need to be after the worker threads.
     int nmutator_threads = nthreads - ngcthreads;
     for (i = 1; i < nmutator_threads; ++i) {
-        jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread
-        t->tid = i;
-        t->barrier = &thread_init_done;
+        jl_threadarg_t *t = jl_threadarg_new(i, &thread_init_done, NULL);
         uv_thread_create(&uvtid, jl_threadfun, t);
         if (exclusive) {
             mask[i] = 1;
diff --git a/src/threading.h b/src/threading.h
index cb26537699713..22b267b73ed38 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -22,10 +22,20 @@ typedef struct _jl_threadarg_t {
     void *arg;
 } jl_threadarg_t;
 
+STATIC_INLINE jl_threadarg_t *jl_threadarg_new(int16_t tid, uv_barrier_t *barrier, void *arg)
+{
+    jl_threadarg_t *targ = (jl_threadarg_t*)malloc_s(sizeof(jl_threadarg_t));
+    targ->tid = tid;
+    targ->barrier = barrier;
+    targ->arg = arg;
+    return targ;
+}
+
 // each thread must initialize its TLS
 jl_ptls_t jl_init_threadtls(int16_t tid) JL_NOTSAFEPOINT;
 
 // provided by a threading infrastructure
+jl_ptls_t jl_threadfun_preamble(void *arg, uint8_t state);
 void jl_init_threadinginfra(void);
 void jl_parallel_gc_threadfun(void *arg);
 void jl_concurrent_gc_threadfun(void *arg);
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index a0a1ecd1964ed..63a1f3a4517d9 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -31,6 +31,25 @@ macro profile(ex)
     end
 end
 
+export @profile_all
+
+"""
+    @profile_all
+
+`@profile_all <expression>` runs your expression while taking periodic backtraces of a sample of all live tasks (both running and not running).
+These are appended to an internal buffer of backtraces.
+"""
+macro profile_all(ex)
+    return quote
+        try
+            start_timer(true)
+            $(esc(ex))
+        finally
+            stop_timer()
+        end
+    end
+end
+
 # An internal function called to show the report after an information request (SIGINFO or SIGUSR1).
 function _peek_report()
     iob = IOBuffer()
@@ -562,9 +581,9 @@ Julia, and examine the resulting `*.mem` files.
 clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ())
 
 # C wrappers
-function start_timer()
+function start_timer(all_tasks::Bool=false)
     check_init() # if the profile buffer hasn't been initialized, initialize with default size
-    status = ccall(:jl_profile_start_timer, Cint, ())
+    status = ccall(:jl_profile_start_timer, Cint, (Bool,), all_tasks)
    if status < 0
        error(error_codes[status])
    end
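For context, a minimal usage sketch of the new macro (illustrative only; it assumes a Julia build that includes this patch and relies on the existing `Profile.print` and `Profile.clear` APIs for reporting):

```julia
using Profile

# Sample every live task while the block runs, including tasks that are
# currently sleeping or blocked rather than running on a thread.
@profile_all begin
    t = Threads.@spawn sum(rand(10_000_000))
    sleep(0.5)      # the waiting parent task is sampled as well
    fetch(t)
end

Profile.print()     # report over the samples collected above
Profile.clear()     # reset the shared sample buffer
```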