From 481b510123ee19dae0992a069f915d6a082066b0 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Sun, 7 Feb 2021 03:17:16 +0000 Subject: [PATCH] Enable `RTLD_DEEPBIND` by default, add workaround for musl and FreeBSD We allow the vast majority of functionality for musl and FreeBSD, the only two systems that do not have `RTLD_DEEPBIND` functionality. Our method enables usage of most combinations of BLAS libraries, with the exception of a 32-bit BLAS and a 64-bit BLAS that uses no suffix on its symbols. --- src/Makefile | 2 +- src/autodetection.c | 4 ++ src/dl_utils.c | 10 ++-- src/libblastrampoline.c | 69 ++++++++++++++++++++++++++ src/libblastrampoline_internal.h | 10 ++++ src/libblastrampoline_trampdata.h | 6 +-- src/surrogates.c | 82 +++++++++++++++++++++++++++++++ 7 files changed, 175 insertions(+), 8 deletions(-) create mode 100644 src/surrogates.c diff --git a/src/Makefile b/src/Makefile index 40fd74e..82e92e8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -5,7 +5,7 @@ include $(LBT_ROOT)/src/Make.inc all: $(builddir)/libblastrampoline.$(SHLIB_EXT) # Objects we'll build -MAIN_OBJS := libblastrampoline.o dl_utils.o autodetection.o trampolines/trampolines_$(ARCH).o +MAIN_OBJS := libblastrampoline.o dl_utils.o autodetection.o surrogates.o trampolines/trampolines_$(ARCH).o # Include win_utils.c on windws ifeq ($(OS),WINNT) diff --git a/src/autodetection.c b/src/autodetection.c index f3cf1eb..57b282d 100644 --- a/src/autodetection.c +++ b/src/autodetection.c @@ -44,7 +44,11 @@ int autodetect_blas_interface(void * isamax_addr) { int64_t n = 0xffffffff00000003; float X[3] = {1.0f, 2.0f, 1.0f}; int64_t incx = 1; + + // Override `lsame_` to point to our `fake_lsame` + push_fake_lsame(); int64_t max_idx = isamax(&n, X, &incx); + pop_fake_lsame(); // This means the `isamax()` implementation saw `N < 0`, ergo it's a 64-bit library if (max_idx == 0) { diff --git a/src/dl_utils.c b/src/dl_utils.c index ff7fac2..39e0f5e 100644 --- a/src/dl_utils.c +++ b/src/dl_utils.c @@ -22,8 
+22,7 @@ void throw_dl_error(const char * path) { /* - * Load the given `path`, using as close to `RTLD_NOW | RTLD_LOCAL | RTLD_DEEPBIND` - * as possible across all platforms. + * Load the given `path`, using `RTLD_NOW | RTLD_LOCAL` and `RTLD_DEEPBIND`, if available */ void * load_library(const char * path) { void * new_handle = NULL; @@ -35,10 +34,13 @@ void * load_library(const char * path) { exit(1); } new_handle = (void *)LoadLibraryExW(wpath, NULL, LOAD_WITH_ALTERED_SEARCH_PATH); -#elif defined(_OS_DARWIN_) || defined(_OS_FREEBSD_) - new_handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); #else + // If we have `RTLD_DEEPBIND`, use it! +#if defined(RTLD_DEEPBIND) new_handle = dlopen(path, RTLD_NOW | RTLD_LOCAL | RTLD_DEEPBIND); +#else + new_handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); +#endif #endif if (new_handle == NULL) { throw_dl_error(path); diff --git a/src/libblastrampoline.c b/src/libblastrampoline.c index 1988851..7f7506e 100644 --- a/src/libblastrampoline.c +++ b/src/libblastrampoline.c @@ -1,6 +1,11 @@ #include "libblastrampoline_internal.h" #include "libblastrampoline_trampdata.h" +// Sentinel to tell us if we've got a deepbindless workaround active or not +#define DEEPBINDLESS_INTERFACE_LP64_LOADED 0x01 +#define DEEPBINDLESS_INTERFACE_ILP64_LOADED 0x02 +uint8_t deepbindless_interfaces_loaded = 0x00; + /* * Load the given `libname`, lookup all registered symbols within our `exported_func_names` list, * and `dlsym()` the symbol addresses to load the addresses for forwarding into that library. @@ -11,6 +16,10 @@ * with a second shim library, integrating separate BLAS and LAPACK libraries, merging an LP64 and * ILP64 library into one, or all three use cases at the same time. * + * Note that on certain platforms (currently musl linux and freebsd) you cannot load a non-suffixed + * ILP64 and an LP64 BLAS at the same time. Read the note below about lacking RTLD_DEEPBIND + * support in the system libc for more details. 
+ * * If `verbose` is set to a non-zero value, it will print out debugging information. */ JL_DLLEXPORT int load_blas_funcs(const char * libname, int clear, int verbose) { @@ -51,6 +60,60 @@ JL_DLLEXPORT int load_blas_funcs(const char * libname, int clear, int verbose) { } } + /* + * Now, if we are opening a 64-bit library with 32-bit names (e.g. suffix == ""), + * we can handle that... as long as we're on a system where we can tell a library + * to look up its own symbols before consulting the global symbol table. This is + * important so that when e.g. ILP64 `dgemm_` in this library wants to look up + * `foo_`, it needs to find its own `foo_` but it will find the `foo_` trampoline + * in this library unless we have `RTLD_DEEPBIND` semantics. These semantics are + * the default on MacOS and Windows, and on glibc Linux we enable it with the + * dlopen flag `RTLD_DEEPBIND`, but on musl and FreeBSD we don't have access to + * this flag, so we warn the user that they will be unable to load both LP64 and + * ILP64 libraries on this system. I hear support for this is coming in FreeBSD + * 13.0, so some day this may be possible, but I sincerely hope that this + * capability is not something being designed into new applications. + * + * If you are on a system without the ability for `RTLD_DEEPBIND` semantics no + * sweat, this should work just fine as long as you either (a) only use one + * BLAS library at a time, or (b) use two that have properly namespaced their + * symbols with a different suffix. But if you use two different BLAS libraries + * with the same suffix, this library will complain. Loudly. + * + * We track this by setting flags in `deepbindless_interfaces_loaded` to show + * which interfaces have been loaded with an empty suffix; if the user + * attempts to load another one without setting the `clear` flag, we refuse to + * load it on a deepbindless system, printing out to `stderr` if we're verbose. 
+ */ +#if !defined(RTLD_DEEPBIND) && (defined(_OS_LINUX_) || defined(_OS_FREEBSD_)) + // If `clear` is set, we clear our tracking + if (clear) { + deepbindless_interfaces_loaded = 0x00; + } + + // If we ever load an LP64 BLAS, we mark that interface as being loaded since + // we bind to the suffix-"" names, so even if the names of that library + // internally are suffixed to something else, we ourselves will interfere with + // a future suffix-"" ILP64 BLAS. + if (interface == 32) { + deepbindless_interfaces_loaded |= DEEPBINDLESS_INTERFACE_LP64_LOADED; + } + + // We only mark a loaded ILP64 BLAS if it is a suffix-"" BLAS, since that is + // the only case in which it will interfere with our LP64 BLAS symbols. + if (lib_suffix[0] == '\0' && interface == 64) { + deepbindless_interfaces_loaded |= DEEPBINDLESS_INTERFACE_ILP64_LOADED; + } + + // If more than one flag is set, complain. + if (deepbindless_interfaces_loaded == (DEEPBINDLESS_INTERFACE_ILP64_LOADED | DEEPBINDLESS_INTERFACE_LP64_LOADED)) { + if (verbose) { + fprintf(stderr, "ERROR: Cannot load both LP64 and ILP64 BLAS libraries without proper namespacing on an RTLD_DEEPBIND-less system!\n"); + } + return 0; + } +#endif + // Finally, re-export its symbols: int nforwards = 0; int symbol_idx = 0; @@ -75,6 +138,12 @@ JL_DLLEXPORT int load_blas_funcs(const char * libname, int clear, int verbose) { (*exported_func32_addrs[symbol_idx]) = addr; } else { (*exported_func64_addrs[symbol_idx]) = addr; + + // If we're on an RTLD_DEEPBINDless system and our workaround is activated, + // we take over our own 32-bit symbols as well. 
+ if (deepbindless_interfaces_loaded & DEEPBINDLESS_INTERFACE_ILP64_LOADED) { + (*exported_func32_addrs[symbol_idx]) = addr; + } } nforwards++; } diff --git a/src/libblastrampoline_internal.h b/src/libblastrampoline_internal.h index 1600885..b04ba6a 100644 --- a/src/libblastrampoline_internal.h +++ b/src/libblastrampoline_internal.h @@ -34,6 +34,11 @@ // This is the maximum length of a symbol that we'll allow #define MAX_SYMBOL_LEN 64 +// Data defined in `libblastrampoline_trampdata.h` +extern const char *const exported_func_names[]; +extern const void ** exported_func32_addrs[]; +extern const void ** exported_func64_addrs[]; + // Functions in `win_utils.c` int wchar_to_utf8(const wchar_t * wstr, char *str, size_t maxlen); int utf8_to_wchar(const char * str, wchar_t * wstr, size_t maxlen); @@ -47,3 +52,8 @@ const char * autodetect_symbol_suffix(void * handle); int autodetect_blas_interface(void * isamax_addr); int autodetect_lapack_interface(void * dpotrf_addr); int autodetect_interface(void * handle, const char * suffix); + +// Functions in surrogates.c +void push_fake_lsame(); +void pop_fake_lsame(); +int fake_lsame(char * ca, char * cb); diff --git a/src/libblastrampoline_trampdata.h b/src/libblastrampoline_trampdata.h index 26c62cd..4f6d2fd 100644 --- a/src/libblastrampoline_trampdata.h +++ b/src/libblastrampoline_trampdata.h @@ -14,7 +14,7 @@ EXPORTED_FUNCS(XX_64) // Generate list of function names #define XX(name) #name, -static const char *const exported_func_names[] = { +const char *const exported_func_names[] = { EXPORTED_FUNCS(XX) NULL }; @@ -23,11 +23,11 @@ static const char *const exported_func_names[] = { // Generate list of function addresses to tie names -> variables #define XX(name) &name##_addr, #define XX_64(name) &name##64__addr, -static const void ** exported_func32_addrs[] = { +const void ** exported_func32_addrs[] = { EXPORTED_FUNCS(XX) NULL }; -static const void ** exported_func64_addrs[] = {
EXPORTED_FUNCS(XX_64) NULL };
diff --git a/src/surrogates.c b/src/surrogates.c
new file mode 100644
index 0000000..2c80117
--- /dev/null
+++ b/src/surrogates.c
@@ -0,0 +1,108 @@
+#include "libblastrampoline_internal.h"
+
+/*
+ * Return the index of `name` within the NULL-terminated `exported_func_names` list.
+ *
+ * A missing name signifies a configuration error in our trampoline symbol list, so
+ * rather than returning we print an error to `stderr` and call `exit(1)`.
+ */
+int find_symbol_idx(const char * name) {
+    for (int symbol_idx=0; exported_func_names[symbol_idx] != NULL; ++symbol_idx) {
+        // Compare against the requested `name`, not a hard-coded symbol
+        if (strcmp(exported_func_names[symbol_idx], name) == 0) {
+            return symbol_idx;
+        }
+    }
+
+    // This is fatal as it signifies a configuration error in our trampoline symbol list
+    fprintf(stderr, "Error: Unable to find %s in our symbol list?!\n", name);
+    exit(1);
+}
+
+// Index of `lsame_` in our symbol list; stays -1 until `push_fake_lsame()` looks it up
+int lsame_idx = -1;
+// Addresses displaced by `push_fake_lsame()`, put back by `pop_fake_lsame()`
+const void *old_lsame32 = NULL, *old_lsame64 = NULL;
+
+/*
+ * Point both the `lsame_` and `lsame_64_` forwards at `fake_lsame()`, saving the
+ * previous addresses so that `pop_fake_lsame()` can restore them.  Only one set of
+ * saved addresses is kept, so a second push before a pop overwrites the first.
+ */
+void push_fake_lsame() {
+    // Find `lsame_` in our symbol list (if we haven't done so before)
+    if (lsame_idx == -1) {
+        lsame_idx = find_symbol_idx("lsame_");
+    }
+
+    // Save old values of `lsame_` and `lsame_64_` to our swap location
+    old_lsame32 = (*exported_func32_addrs[lsame_idx]);
+    old_lsame64 = (*exported_func64_addrs[lsame_idx]);
+
+    // Insert our "fake" lsame in so that we always have a half-functional copy
+    (*exported_func32_addrs[lsame_idx]) = &fake_lsame;
+    (*exported_func64_addrs[lsame_idx]) = &fake_lsame;
+}
+
+/*
+ * Undo `push_fake_lsame()`: restore the saved `lsame_` and `lsame_64_` addresses and
+ * clear the swap location.  Exits if `lsame_idx` has never been looked up by a push.
+ */
+void pop_fake_lsame() {
+    if (lsame_idx == -1) {
+        // Did you call `pop_fake_lsame()` without calling `push_fake_lsame()` first?!
+        fprintf(stderr, "pop_fake_lsame() called with invalid `lsame_idx`!\n");
+        exit(1);
+    }
+
+    (*exported_func32_addrs[lsame_idx]) = old_lsame32;
+    (*exported_func64_addrs[lsame_idx]) = old_lsame64;
+
+    old_lsame32 = NULL;
+    old_lsame64 = NULL;
+}
+
+
+/*
+ * Compare the single characters `*ca` and `*cb` ignoring case; returns 1 if they
+ * match and 0 otherwise.  `lsame_` implementation taken from
+ * `http://www.netlib.org/clapack/cblas/lsame.c`, with the f2c-style `static` locals
+ * made automatic so that concurrent callers do not share scratch state.
+ */
+int fake_lsame(char * ca, char * cb) {
+    /* Local variables */
+    int inta, intb, zcode;
+
+    if (*(unsigned char *)ca == *(unsigned char *)cb) {
+        return 1;
+    }
+
+    // The numeric value of 'Z' picks the branch below (90 is 'Z' in ASCII)
+    zcode = 'Z';
+    inta = *(unsigned char *)ca;
+    intb = *(unsigned char *)cb;
+
+    if (zcode == 90 || zcode == 122) {
+        if (inta >= 97 && inta <= 122) {
+            inta += -32;
+        }
+        if (intb >= 97 && intb <= 122) {
+            intb += -32;
+        }
+    } else if (zcode == 233 || zcode == 169) {
+        if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) {
+            inta += 64;
+        }
+        if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) {
+            intb += 64;
+        }
+    } else if (zcode == 218 || zcode == 250) {
+        if (inta >= 225 && inta <= 250) {
+            inta += -32;
+        }
+        if (intb >= 225 && intb <= 250) {
+            intb += -32;
+        }
+    }
+    return inta == intb;
+}
\ No newline at end of file