Skip to content

Commit

Permalink
i#4030: Add static TLS support to the Windows private loader (#4035)
Browse files Browse the repository at this point in the history
Adds static TLS support to the Windows private loader.
This involves the following:

Swaps the TEB->ThreadLocalStoragePointer field between app and private
states, with new dcontext fields to store the values.

Adds an os_privmod_data_t to the Windows loader to store TLS data.
Parses PE TLS data fields, records the TLS callbacks, sets the TLS
array index, and records the TLS initialization data.  Calls TLS
callbacks prior to regular entry points; warns on crashes but does not
consider them fatal.

Handling TLS is simplified with a hardcoded maximum size and lack of
support for dynamically-loaded-library TLS.  This lets us allocate an
array at thread init and not need any complex scheme to reallocate it.
Prior to calling the thread init function for a library, its TLS data
is allocated and initialized from its recorded initialization data.

Since both process init and process exit library functions expect TLS
to be set up, reorders several loader sequences:

+ Partially unifies Windows and Linux by splitting loader_init() into
  loader_init_prologue() (for setting up the private PEB used by
  arch_init for gencode), called early, and loader_init_epilogue(),
  called after thread init so we have a dcontext and can set up the
  TLS.  However, Linux still needs relocs and TLS *after* thread init
  while Windows is the reverse, so we have a split ordering in a
  newly-divided loader_init_{prologue,epilogue}().
  This undoes the #338 special casing which is now removed.

+ Splits out instrument_exit_event() from instrument_exit() and moves
  instrument_exit() to after the final thread exit, to ensure we call
  the thread exit library functions.  Removes
  instrument_exit_post_sideline() which is now merged into
  instrument_exit().

+ Adds loader_make_exit_calls() to enable calling both the thread exit
  and process exit library functions before TLS is freed (yes, process
  exit functions blindly de-reference TLS, just like process init
  functions do).

+ Delays process and thread init function calling until the
  statically-imported set of libraries is fully loaded, so we have that
  TLS array size.

Reverses the modlist iteration order for function calling, to properly
call independent libraries first before their dependents.

Adds static TLS tests to the client.raw_tls test.  Renames the test to
client.tls, and changes the client to C++ to use std::vector and a
custom class with the C++11 'thread_local' for TLS callback testing.
However, the vector is disabled for Linux because it breaks that
loader: i#4034.  Plus, VS2013 doesn't support 'thread_local', so that
is disabled until we upgrade to VS2017.

Issue: #338, #4002, #4030, #4034
Fixes #4030
Fixes #4002
  • Loading branch information
derekbruening authored Jan 21, 2020
1 parent 0a3c7f4 commit 9293e7a
Show file tree
Hide file tree
Showing 23 changed files with 728 additions and 207 deletions.
5 changes: 4 additions & 1 deletion core/arch/arch.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2010-2019 Google, Inc. All rights reserved.
* Copyright (c) 2010-2020 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/

Expand Down Expand Up @@ -184,6 +184,9 @@ mixed_mode_enabled(void)
# define PRIV_RPC_OFFSET ((PROT_OFFS) + offsetof(dcontext_t, priv_nt_rpc))
# define APP_NLS_CACHE_OFFSET ((PROT_OFFS) + offsetof(dcontext_t, app_nls_cache))
# define PRIV_NLS_CACHE_OFFSET ((PROT_OFFS) + offsetof(dcontext_t, priv_nls_cache))
# define APP_STATIC_TLS_OFFSET ((PROT_OFFS) + offsetof(dcontext_t, app_static_tls))
# define PRIV_STATIC_TLS_OFFSET \
((PROT_OFFS) + offsetof(dcontext_t, priv_static_tls))
# endif
# define APP_STACK_LIMIT_OFFSET ((PROT_OFFS) + offsetof(dcontext_t, app_stack_limit))
# define APP_STACK_BASE_OFFSET ((PROT_OFFS) + offsetof(dcontext_t, app_stack_base))
Expand Down
61 changes: 40 additions & 21 deletions core/arch/emit_utils_shared.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2010-2019 Google, Inc. All rights reserved.
* Copyright (c) 2010-2020 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/

Expand Down Expand Up @@ -1743,26 +1743,6 @@ preinsert_swap_peb(dcontext_t *dcontext, instrlist_t *ilist, instr_t *next, bool
0, ERRNO_TIB_OFFSET, OPSZ_4),
opnd_create_reg(scratch32)));
}
/* We also swap TEB->NlsCache. Unlike TEB->ProcessEnvironmentBlock, which is
* constant, and TEB->LastErrorCode, which is not persistent, we have to maintain
* both values and swap between them which is expensive.
*/
PRE(ilist, next,
XINST_CREATE_load(dcontext, opnd_create_reg(reg_scratch),
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0,
NLS_CACHE_TIB_OFFSET, OPSZ_PTR)));
PRE(ilist, next,
SAVE_TO_DC_VIA_REG(absolute, dcontext, reg_dr, reg_scratch,
to_priv ? APP_NLS_CACHE_OFFSET : PRIV_NLS_CACHE_OFFSET));
PRE(ilist, next,
RESTORE_FROM_DC_VIA_REG(absolute, dcontext, reg_dr, reg_scratch,
to_priv ? PRIV_NLS_CACHE_OFFSET
: APP_NLS_CACHE_OFFSET));
PRE(ilist, next,
XINST_CREATE_store(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0,
NLS_CACHE_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(reg_scratch)));
/* We also swap TEB->FlsData. Unlike TEB->ProcessEnvironmentBlock, which is
* constant, and TEB->LastErrorCode, which is not persistent, we have to maintain
* both values and swap between them which is expensive.
Expand Down Expand Up @@ -1800,6 +1780,45 @@ preinsert_swap_peb(dcontext_t *dcontext, instrlist_t *ilist, instr_t *next, bool
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0,
NT_RPC_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(reg_scratch)));
/* We also swap TEB->NlsCache. */
PRE(ilist, next,
XINST_CREATE_load(dcontext, opnd_create_reg(reg_scratch),
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0,
NLS_CACHE_TIB_OFFSET, OPSZ_PTR)));
PRE(ilist, next,
SAVE_TO_DC_VIA_REG(absolute, dcontext, reg_dr, reg_scratch,
to_priv ? APP_NLS_CACHE_OFFSET : PRIV_NLS_CACHE_OFFSET));
PRE(ilist, next,
RESTORE_FROM_DC_VIA_REG(absolute, dcontext, reg_dr, reg_scratch,
to_priv ? PRIV_NLS_CACHE_OFFSET
: APP_NLS_CACHE_OFFSET));
PRE(ilist, next,
XINST_CREATE_store(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0,
NLS_CACHE_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(reg_scratch)));
/* We also have to swap TEB->ThreadLocalStoragePointer. Unlike the other
* fields, we control this private one so we never set it from the TEB field.
*/
if (to_priv) {
PRE(ilist, next,
XINST_CREATE_load(dcontext, opnd_create_reg(reg_scratch),
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL,
0, STATIC_TLS_TIB_OFFSET,
OPSZ_PTR)));
PRE(ilist, next,
SAVE_TO_DC_VIA_REG(absolute, dcontext, reg_dr, reg_scratch,
APP_STATIC_TLS_OFFSET));
}
PRE(ilist, next,
RESTORE_FROM_DC_VIA_REG(absolute, dcontext, reg_dr, reg_scratch,
to_priv ? PRIV_STATIC_TLS_OFFSET
: APP_STATIC_TLS_OFFSET));
PRE(ilist, next,
XINST_CREATE_store(dcontext,
opnd_create_far_base_disp(SEG_TLS, REG_NULL, REG_NULL, 0,
STATIC_TLS_TIB_OFFSET, OPSZ_PTR),
opnd_create_reg(reg_scratch)));
}
# endif /* CLIENT_INTERFACE */
}
Expand Down
5 changes: 4 additions & 1 deletion core/dispatch.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2011-2019 Google, Inc. All rights reserved.
* Copyright (c) 2011-2020 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/

Expand Down Expand Up @@ -1016,6 +1016,7 @@ dispatch_exit_fcache(dcontext_t *dcontext)
ASSERT(dcontext->app_nt_rpc == NULL ||
dcontext->app_nt_rpc != dcontext->priv_nt_rpc);
ASSERT(!is_dynamo_address(dcontext->app_nls_cache));
ASSERT(!is_dynamo_address(dcontext->app_static_tls));
ASSERT(!is_dynamo_address(dcontext->app_stack_limit) ||
IS_CLIENT_THREAD(dcontext));
ASSERT(!is_dynamo_address((byte *)dcontext->app_stack_base - 1) ||
Expand All @@ -1039,6 +1040,8 @@ dispatch_exit_fcache(dcontext_t *dcontext)
get_mcontext(dcontext)->xsp >= (reg_t)d_r_get_tls(BASE_STACK_TIB_OFFSET)));
ASSERT(dcontext->app_nls_cache == NULL ||
dcontext->app_nls_cache != dcontext->priv_nls_cache);
ASSERT(dcontext->app_static_tls == NULL ||
dcontext->app_static_tls != dcontext->priv_static_tls);
}
#endif

Expand Down
42 changes: 25 additions & 17 deletions core/dynamo.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2010-2019 Google, Inc. All rights reserved.
* Copyright (c) 2010-2020 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/

Expand Down Expand Up @@ -557,13 +557,11 @@ dynamorio_app_init(void)
}
#endif /* WINDOWS */

#ifdef WINDOWS
/* loader initialization, finalize the private lib load.
* i#338: this must be before d_r_arch_init() for Windows, but Linux
* wants it later (i#2751).
/* Set up any private-loader-related data we need before generating any
* code, such as the private PEB on Windows.
*/
loader_init();
#endif
loader_init_prologue();

d_r_arch_init();
synch_init();

Expand Down Expand Up @@ -632,10 +630,8 @@ dynamorio_app_init(void)
* require changing start/stop API
*/
dynamo_thread_init(NULL, NULL, NULL _IF_CLIENT_INTERFACE(false));
#ifndef WINDOWS
/* i#2751: we need TLS to be set up to relocate and call init funcs. */
loader_init();
#endif
loader_init_epilogue(get_thread_private_dcontext());

/* We move vm_areas_init() below dynamo_thread_init() so we can have
* two things: 1) a dcontext and 2) a SIGSEGV handler, for TRY/EXCEPT
Expand Down Expand Up @@ -1021,17 +1017,15 @@ dynamo_shared_exit(thread_record_t *toexit /* must ==cur thread for Linux */
* fragment_deleted() callbacks (xref PR 228156). FIXME - might be issues with the
* client trying to use api routines that depend on fragment state.
*/
instrument_exit();
instrument_exit_event();
# ifdef CLIENT_SIDELINE
/* We only need do a second synch-all if there are sideline client threads. */
if (d_r_get_num_threads() > 1)
synch_with_threads_at_exit(exit_synch_state(), false /*post-exit*/);
/* only current thread is alive */
dynamo_exited_all_other_threads = true;
# endif /* CLIENT_SIDELINE */
/* Some lock can only be deleted if only one thread left. */
instrument_exit_post_sideline();
#endif /* CLIENT_INTERFACE */
#endif /* CLIENT_INTERFACE */
fragment_exit_post_sideline();

/* The dynamo_exited_and_cleaned should be set after the second synch-all.
Expand All @@ -1044,9 +1038,17 @@ dynamo_shared_exit(thread_record_t *toexit /* must ==cur thread for Linux */
destroy_event(dr_app_started);
destroy_event(dr_attach_finished);

/* Make thread and process exit calls before we clean up thread data. */
loader_make_exit_calls(get_thread_private_dcontext());
/* we want dcontext around for loader_exit() */
if (get_thread_private_dcontext() != NULL)
loader_thread_exit(get_thread_private_dcontext());
#ifdef CLIENT_INTERFACE
/* This will unload client libs, which we delay until after they receive their
* thread exit calls in loader_thread_exit().
*/
instrument_exit();
#endif
loader_exit();

if (toexit != NULL) {
Expand Down Expand Up @@ -1524,22 +1526,26 @@ dynamo_process_exit(void)
* fragment_deleted() callbacks (xref PR 228156). FIXME - might be issues
* with the client trying to use api routines that depend on fragment state.
*/
instrument_exit();
instrument_exit_event();

# ifdef CLIENT_SIDELINE
/* We only need do a second synch-all if there are sideline client threads. */
if (d_r_get_num_threads() > 1)
synch_with_threads_at_exit(exit_synch_state(), false /*post-exit*/);
dynamo_exited_all_other_threads = true;
# endif
/* Some lock can only be deleted if one thread left. */
instrument_exit_post_sideline();

/* i#1617: We need to call client library fini routines for global
* destructors, etc.
*/
if (!INTERNAL_OPTION(nullcalls) && !DYNAMO_OPTION(skip_thread_exit_at_exit))
loader_thread_exit(get_thread_private_dcontext());
# ifdef CLIENT_INTERFACE
/* This will unload client libs, which we delay until after they receive their
* thread exit calls in loader_thread_exit().
*/
instrument_exit();
# endif
loader_exit();

/* for -private_loader we do this here to catch more exit-time crashes */
Expand Down Expand Up @@ -1864,6 +1870,8 @@ create_callback_dcontext(dcontext_t *old_dcontext)
new_dcontext->priv_nt_rpc = old_dcontext->priv_nt_rpc;
new_dcontext->app_nls_cache = old_dcontext->app_nls_cache;
new_dcontext->priv_nls_cache = old_dcontext->priv_nls_cache;
new_dcontext->app_static_tls = old_dcontext->app_static_tls;
new_dcontext->priv_static_tls = old_dcontext->priv_static_tls;
# endif
new_dcontext->app_stack_limit = old_dcontext->app_stack_limit;
new_dcontext->app_stack_base = old_dcontext->app_stack_base;
Expand Down
4 changes: 3 additions & 1 deletion core/globals.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2011-2019 Google, Inc. All rights reserved.
* Copyright (c) 2011-2020 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/

Expand Down Expand Up @@ -833,6 +833,8 @@ struct _dcontext_t {
void *priv_nt_rpc;
void *app_nls_cache;
void *priv_nls_cache;
void *app_static_tls;
void *priv_static_tls;
# endif
void *app_stack_limit;
void *app_stack_base;
Expand Down
19 changes: 10 additions & 9 deletions core/lib/instrument.c
Original file line number Diff line number Diff line change
Expand Up @@ -861,15 +861,7 @@ free_all_callback_lists()
}

void
instrument_exit_post_sideline(void)
{
# if defined(WINDOWS) || defined(CLIENT_SIDELINE)
DELETE_LOCK(client_thread_count_lock);
# endif
}

void
instrument_exit(void)
instrument_exit_event(void)
{
/* Note - currently own initexit lock when this is called (see PR 227619). */

Expand All @@ -880,6 +872,12 @@ instrument_exit(void)
/* It seems the compiler is confused if we pass no var args
* to the call_all macro. Bogus NULL arg */
NULL);
}

void
instrument_exit(void)
{
/* Note - currently own initexit lock when this is called (see PR 227619). */

if (IF_DEBUG_ELSE(true, doing_detach)) {
/* Unload all client libs and free any allocated storage */
Expand All @@ -898,6 +896,9 @@ instrument_exit(void)
num_client_libs = 0;
# ifdef WINDOWS
DELETE_LOCK(client_aux_lib64_lock);
# endif
# if defined(WINDOWS) || defined(CLIENT_SIDELINE)
DELETE_LOCK(client_thread_count_lock);
# endif
DELETE_READWRITE_LOCK(callback_registration_lock);
}
Expand Down
5 changes: 2 additions & 3 deletions core/lib/instrument.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ instrument_load_client_libs(void);
void
instrument_init(void);
void
instrument_exit_event(void);
void
instrument_exit(void);
bool
is_in_client_lib(app_pc addr);
Expand Down Expand Up @@ -136,9 +138,6 @@ instrument_kernel_xfer(dcontext_t *dcontext, dr_kernel_xfer_type_t type,

void
instrument_nudge(dcontext_t *dcontext, client_id_t id, uint64 arg);
/* post instrument_event() cleanup */
void
instrument_exit_post_sideline(void);
# ifdef WINDOWS
bool
instrument_exception(dcontext_t *dcontext, dr_exception_t *exception);
Expand Down
Loading

0 comments on commit 9293e7a

Please sign in to comment.