diff --git a/.github/workflows/build-all.yml b/.github/workflows/build-all.yml
index 22b71a2b..62617826 100644
--- a/.github/workflows/build-all.yml
+++ b/.github/workflows/build-all.yml
@@ -5,6 +5,7 @@ on:
     - cron: '0 0 * * *'
   release:
     types: [ published ]
+  workflow_dispatch:
 
 jobs:
   build-tarballs:
@@ -39,7 +40,7 @@ jobs:
       run: ./dist.sh ${{ matrix.target }}
 
     - name: Upload artifact
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
       with:
-        name: tarballs
+        name: ${{ matrix.target }}
         path: mold-*.tar.gz
diff --git a/.github/workflows/build-x86.yml b/.github/workflows/build-x86.yml
index 1866691c..828a9f14 100644
--- a/.github/workflows/build-x86.yml
+++ b/.github/workflows/build-x86.yml
@@ -3,6 +3,7 @@ name: Build x86 tarball
 on:
   push:
     branches: [ main ]
+  workflow_dispatch:
 
 jobs:
   build-tarball:
@@ -30,7 +31,7 @@ jobs:
       run: ./dist.sh
 
     - name: Upload artifact
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
       with:
         name: tarball
         path: mold-*.tar.gz
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3015849f..16e27fac 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,40 +1,30 @@
 name: CI
 on:
   push:
-    branches: [ main ]
   pull_request:
-    branches: [ main ]
 env:
   UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1
 jobs:
-  build-clang:
+  build-sanitizers:
     strategy:
       matrix:
         target:
-        # Disable PCH for the default configuration. This prevents relying on implicit includes.
-        - '-DCMAKE_DISABLE_PRECOMPILE_HEADERS=On'
+        - ''
         - '-DMOLD_USE_ASAN=On'
         - '-DMOLD_USE_TSAN=On'
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v3
     - uses: rui314/setup-mold@staging
-    - name: install-build-deps
-      run: sudo ./install-build-deps.sh
-    - name: ccache
-      uses: hendrikmuhs/ccache-action@v1
+    - run: sudo ./install-build-deps.sh
     - name: build
       run: |
-        echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> $GITHUB_PATH
-        sudo apt-get install -y clang++-12
+        sudo apt-get install -y clang-18 clang gcc-multilib gdb dwarfdump zstd
         mkdir build
         cd build
-        cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++-12 ${{ matrix.target }} ..
+        cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 ${{ matrix.target }} ..
         cmake --build . -j$(nproc)
-    - name: test
-      run: |
-        cd build
-        ctest -j$(nproc)
+    - run: ctest --test-dir build -j$(nproc)
     - name: archive test results
       uses: actions/upload-artifact@v3
       if: failure()
@@ -44,8 +34,8 @@ jobs:
           build
           !build/CMakeFiles
 
-  build-gcc:
-    runs-on: ubuntu-20.04
+  build-multi-archs:
+    runs-on: ubuntu-latest
     container: gcc:11.1.0
     steps:
     - uses: actions/checkout@v3
@@ -72,34 +62,29 @@ jobs:
 
         # Install a LoongArch toolchain
         mkdir /larch
-        wget -O- -q https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz | tar -C /larch --strip-components=1 --xz -xf -
+        wget -O- -q https://github.com/loongson/build-tools/releases/download/2024.08.08/x86_64-cross-tools-loongarch64-binutils_2.43-gcc_14.2.0-glibc_2.40.tar.xz | tar -C /larch --strip-components=1 --xz -xf -
 
+        cp -r /larch/loongarch64-unknown-linux-gnu/lib/* /larch/target/lib64
         ln -sf /larch/target /usr/loongarch64-linux-gnu
-        cp -r /larch/loongarch64-unknown-linux-gnu/lib/* /usr/loongarch64-linux-gnu/lib64/
 
-        for i in objdump objcopy strip; do
+        for i in gcc g++ objdump objcopy strip; do
           ln -sf /larch/bin/loongarch64-unknown-linux-gnu-$i /usr/bin/loongarch64-linux-gnu-$i
         done
 
-        echo '/larch/bin/loongarch64-unknown-linux-gnu-gcc -L/larch/loongarch64-unknown-linux-gnu "$@"' > /usr/bin/loongarch64-linux-gnu-gcc
-        echo '/larch/bin/loongarch64-unknown-linux-gnu-g++ -L/larch/loongarch64-unknown-linux-gnu "$@"' > /usr/bin/loongarch64-linux-gnu-g++
-        chmod 755 /usr/bin/loongarch64-linux-gnu-{gcc,g++}
-
         wget -O /usr/local/bin/qemu-loongarch64 -q https://github.com/loongson/build-tools/releases/download/2023.08.08/qemu-loongarch64
         chmod 755 /usr/local/bin/qemu-loongarch64
-    - name: ccache
-      uses: hendrikmuhs/ccache-action@v1
+
+        # Install Intel SDE CPU emulator for CET-related tests
+        mkdir /sde
+        wget -O- -q https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz | tar -C /sde --strip-components=1 --xz -xf -
+        ln -s /sde/sde /usr/bin
     - name: build
       run: |
-        echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> $GITHUB_PATH
         mkdir build
         cd build
         cmake ..
         cmake --build . -j$(nproc)
-    - name: test
-      run: |
-        cd build
-        ctest -j$(nproc)
+    - run: ctest --test-dir build -j$(nproc)
     - name: archive test results
       uses: actions/upload-artifact@v3
       if: failure()
@@ -109,24 +94,38 @@ jobs:
           build
           !build/CMakeFiles
 
-  build-macos:
-    runs-on: macos-11
+  build-distros:
     strategy:
       matrix:
-        target:
-        # Disable PCH for the default configuration. This prevents relying on implicit includes.
-        - '-DCMAKE_DISABLE_PRECOMPILE_HEADERS=On'
-        - '-DMOLD_USE_ASAN=On'
+        distro:
+        - alpine
+        - archlinux
+        - fedora
+        - gentoo/stage3
+        - opensuse/tumbleweed
+        - ubuntu:22.04
+    runs-on: ubuntu-latest
+    container: ${{ matrix.distro }}  # every step below runs inside this distro's container image
+    steps:
+    - uses: actions/checkout@v3
+    - run: ./install-build-deps.sh
+    - name: build
+      run: |
+        mkdir build
+        cd build
+        cmake ..
+        cmake --build . -j$(nproc)
+    - run: ctest --test-dir build -j$(nproc)
+
+  build-macos:
+    runs-on: macos-12
     steps:
     - uses: actions/checkout@v3
-    - name: ccache
-      uses: hendrikmuhs/ccache-action@v1
     - name: build
       run: |
-        echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> $GITHUB_PATH
         mkdir build
         cd build
-        cmake ${{ matrix.target }} ..
+        cmake ..
         cmake --build . -j$(sysctl -n hw.physicalcpu)
 
   build-windows:
@@ -155,5 +154,21 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake -GNinja -DMOLD_USE_MIMALLOC=OFF -DMOLD_USE_SYSTEM_TBB=ON -DMOLD_USE_MOLD=OFF ..
+        cmake -GNinja -DMOLD_USE_MIMALLOC=OFF -DMOLD_USE_SYSTEM_TBB=ON ..
         cmake --build . -j $(nproc)
+
+  build-freebsd:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Build and test
+      uses: vmactions/freebsd-vm@v1
+      with:
+        usesh: true
+        run: |
+          ./install-build-deps.sh
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . -j$(nproc)
+          ctest -j$(nproc)
diff --git a/.github/workflows/update-manpage.yml b/.github/workflows/update-manpage.yml
index e4442a72..1107e774 100644
--- a/.github/workflows/update-manpage.yml
+++ b/.github/workflows/update-manpage.yml
@@ -1,5 +1,3 @@
-# This file is generated by ChatGPT
-
 name: Update manpage
 
 on:
@@ -8,6 +6,7 @@ on:
       - 'docs/mold.md'
     branches:
       - main
+  workflow_dispatch:
 
 jobs:
   update-manpage:
@@ -18,7 +17,7 @@ jobs:
       uses: actions/checkout@v2
 
     - name: Install ronn
-      run: sudo apt-get install -y ronn
+      run: sudo apt-get update && sudo apt-get install -y ronn
 
     - name: Generate mold.1 from mold.md
       run: ronn --roff docs/mold.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad31bf03..d6d1500a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@
 # features and behave exactly the same.
 
 cmake_minimum_required(VERSION 3.14)
-project(mold VERSION 2.32.1)
+project(mold VERSION 2.34.0)
 
 include(CMakeDependentOption)
 include(CheckSymbolExists)
@@ -61,6 +61,8 @@ target_compile_features(mold PRIVATE cxx_std_20)
 
 if(MINGW)
   target_link_libraries(mold PRIVATE dl)
+else()
+  target_link_libraries(mold PRIVATE ${CMAKE_DL_LIBS})
 endif()
 
 if(NOT "${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC")
@@ -250,7 +252,7 @@ if(NOT APPLE AND NOT WIN32)
   # Remove the default `lib` prefix
   set_target_properties(mold-wrapper PROPERTIES PREFIX "")
   target_link_libraries(mold-wrapper PRIVATE ${CMAKE_DL_LIBS})
-  target_sources(mold-wrapper PRIVATE elf/mold-wrapper.c)
+  target_sources(mold-wrapper PRIVATE src/mold-wrapper.c)
 endif()
 
 # If atomics doesn't work by default, add -latomic.
@@ -275,12 +277,6 @@ if(NOT APPLE AND NOT MSVC)
   target_link_options(mold PRIVATE -pthread)
 endif()
 
-# shm_open needs -lrt
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(mold PRIVATE rt)
-endif()
-
 check_symbol_exists(madvise sys/mman.h HAVE_MADVISE)
 
 # Create a .cc file containing the current git hash for `mold --version`.
@@ -288,15 +284,15 @@ add_custom_target(git_hash
   COMMAND ${CMAKE_COMMAND}
     -DSOURCE_DIR=${CMAKE_SOURCE_DIR}
     -DOUTPUT_FILE=${CMAKE_BINARY_DIR}/git-hash.cc
-    -P ${CMAKE_SOURCE_DIR}/common/update-git-hash.cmake
-  DEPENDS common/update-git-hash.cmake
+    -P ${CMAKE_SOURCE_DIR}/lib/update-git-hash.cmake
+  DEPENDS lib/update-git-hash.cmake
   BYPRODUCTS git-hash.cc
   VERBATIM)
 
 add_dependencies(mold git_hash)
 
 # Create config.h file
-configure_file(common/config.h.in config.h)
+configure_file(lib/config.h.in config.h)
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 # Almost all functions are template in mold which take a target type
@@ -309,32 +305,44 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 # on a multicore machine.
 list(APPEND MOLD_ELF_TARGETS
   X86_64 I386 ARM64 ARM32 RV32LE RV32BE RV64LE RV64BE PPC32 PPC64V1 PPC64V2
-  S390X SPARC64 M68K SH4 ALPHA LOONGARCH32 LOONGARCH64)
+  S390X SPARC64 M68K SH4 LOONGARCH32 LOONGARCH64)
 
 list(APPEND MOLD_ELF_TEMPLATE_FILES
-  elf/arch-loongarch.cc
-  elf/arch-riscv.cc
-  elf/cmdline.cc
-  elf/gc-sections.cc
-  elf/gdb-index.cc
-  elf/icf.cc
-  elf/input-files.cc
-  elf/input-sections.cc
-  elf/linker-script.cc
-  elf/main.cc
-  elf/mapfile.cc
-  elf/output-chunks.cc
-  elf/passes.cc
-  elf/relocatable.cc
-  elf/subprocess.cc
-  elf/thunks.cc
-  elf/tls.cc
+  src/arch-loongarch.cc
+  src/arch-riscv.cc
+  src/cmdline.cc
+  src/gc-sections.cc
+  src/gdb-index.cc
+  src/icf.cc
+  src/input-files.cc
+  src/input-sections.cc
+  src/linker-script.cc
+  src/main.cc
+  src/mapfile.cc
+  src/output-chunks.cc
+  src/passes.cc
+  src/relocatable.cc
+  src/shrink-sections.cc
+  src/thunks.cc
+  src/tls.cc
   )
 
 if(WIN32 AND NOT MINGW)
-  list(APPEND MOLD_ELF_TEMPLATE_FILES elf/lto-win32.cc)
+  list(APPEND MOLD_ELF_TEMPLATE_FILES src/lto-win32.cc)
+else()
+  list(APPEND MOLD_ELF_TEMPLATE_FILES src/lto-unix.cc)
+endif()
+
+if(WIN32)
+  list(APPEND MOLD_ELF_TEMPLATE_FILES
+    src/output-file-win32.cc
+    src/subprocess-win32.cc
+    )
 else()
-  list(APPEND MOLD_ELF_TEMPLATE_FILES elf/lto-unix.cc)
+  list(APPEND MOLD_ELF_TEMPLATE_FILES
+    src/output-file-unix.cc
+    src/subprocess-unix.cc
+  )
 endif()
 
 function(mold_instantiate_templates SOURCE TARGET)
@@ -356,58 +364,48 @@ endforeach()
 
 # Add other non-template source files.
 target_sources(mold PRIVATE
-  common/compress.cc
-  common/demangle.cc
-  common/filepath.cc
-  common/glob.cc
-  common/hyperloglog.cc
-  common/malloc.cc
-  common/multi-glob.cc
-  common/perf.cc
-  common/random.cc
-  common/tar.cc
-  elf/arch-alpha.cc
-  elf/arch-arm32.cc
-  elf/arch-arm64.cc
-  elf/arch-i386.cc
-  elf/arch-m68k.cc
-  elf/arch-ppc32.cc
-  elf/arch-ppc64v1.cc
-  elf/arch-ppc64v2.cc
-  elf/arch-s390x.cc
-  elf/arch-sh4.cc
-  elf/arch-sparc64.cc
-  elf/arch-x86-64.cc
-  elf/config.cc
-  elf/elf.cc
   git-hash.cc
+  lib/compress.cc
+  lib/crc32.cc
+  lib/demangle.cc
+  lib/filepath.cc
+  lib/glob.cc
+  lib/hyperloglog.cc
+  lib/malloc.cc
+  lib/multi-glob.cc
+  lib/perf.cc
+  lib/random.cc
+  lib/tar.cc
+  src/arch-arm32.cc
+  src/arch-arm64.cc
+  src/arch-i386.cc
+  src/arch-m68k.cc
+  src/arch-ppc32.cc
+  src/arch-ppc64v1.cc
+  src/arch-ppc64v2.cc
+  src/arch-s390x.cc
+  src/arch-sh4.cc
+  src/arch-sparc64.cc
+  src/arch-x86-64.cc
+  src/config.cc
+  src/elf.cc
   third-party/rust-demangle/rust-demangle.c
   )
 
 if(WIN32)
   target_sources(mold PRIVATE
-    common/jobs-win32.cc
-    common/mapped-file-win32.cc
-    common/signal-win32.cc
+    lib/jobs-win32.cc
+    lib/mapped-file-win32.cc
+    lib/signal-win32.cc
     )
 else()
   target_sources(mold PRIVATE
-    common/jobs-unix.cc
-    common/mapped-file-unix.cc
-    common/signal-unix.cc
+    lib/jobs-unix.cc
+    lib/mapped-file-unix.cc
+    lib/signal-unix.cc
     )
 endif()
 
-# Add frequently included header files for pre-compiling.
-# target_precompile_headers is supported by CMake 3.16.0 or newer.
-if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16.0")
-  # ccache needs this flag along with `sloppiness = pch_defines,time_macros`
-  # to enable caching
-  if(NOT MSVC)
-    target_compile_options(mold PRIVATE -fpch-preprocess)
-  endif()
-endif()
-
 include(CTest)
 
 if(BUILD_TESTING)
@@ -422,7 +420,7 @@ if(BUILD_TESTING)
   endif()
 
   if(${UNIX})
-    add_subdirectory(test/elf)
+    add_subdirectory(test)
   endif()
 endif()
 
diff --git a/README.md b/README.md
index 7bcbfa38..c6ddf37b 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,7 @@ free to [file a bug report](https://github.com/rui314/mold/issues).
 
 mold supports x86-64, i386, ARM64, ARM32, 64-bit/32-bit little/big-endian
 RISC-V, 32-bit PowerPC, 64-bit big-endian PowerPC ELFv1, 64-bit little-endian
-PowerPC ELFv2, s390x, 64-bit/32-bit LoongArch, SPARC64, m68k, SH-4, and DEC
-Alpha.
+PowerPC ELFv2, s390x, 64-bit/32-bit LoongArch, SPARC64, m68k, and SH-4.
 
 ## Why does linking speed matter?
 
@@ -133,7 +132,7 @@ may be able to remove the `linker = "clang"` line.
 
 ```toml
 [target.x86_64-unknown-linux-gnu]
-rustflags = ["-C", "link-arg=-fuse-ld=/path/to/mold"]
+rustflags = ["-C", "link-arg=-fuse-ld=mold"]
 ```
 
 If you want to use mold for all projects, add the above snippet to
diff --git a/common/integers.h b/common/integers.h
deleted file mode 100644
index 2ad02d0c..00000000
--- a/common/integers.h
+++ /dev/null
@@ -1,221 +0,0 @@
-// This file defines integral types for file input/output. We need to use
-// these types instead of the plain integers (such as uint32_t or int32_t)
-// when reading from/writing to an mmap'ed file area for the following
-// reasons:
-//
-// 1. mold is always a cross linker and should not depend on what host it
-//    is running on. Users should be able to run mold on a big-endian
-//    SPARC machine to create a little-endian RV64 binary, for example.
-//
-// 2. Even though data members in all ELF data strucutres are naturally
-//    aligned, they are not guaranteed to be aligned on memory. Because
-//    archive file (.a file) aligns each member only to a 2 byte boundary,
-//    anything larger than 2 bytes may be unaligned in an mmap'ed memory.
-//    Unaligned access is an undefined behavior in C/C++, so we shouldn't
-//    cast an arbitrary pointer to a uint32_t, for example, to read a
-//    32-bits value.
-//
-// The data types defined in this file don't depend on host byte order and
-// don't do unaligned access.
-
-#pragma once
-
-#include <bit>
-#include <cstdint>
-#include <cstring>
-
-#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
-# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#  define __LITTLE_ENDIAN__ 1
-# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#  define __BIG_ENDIAN__ 1
-# else
-#  error "unknown host byte order"
-# endif
-#endif
-
-namespace mold {
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-
-typedef int8_t i8;
-typedef int16_t i16;
-typedef int32_t i32;
-typedef int64_t i64;
-
-template <typename T>
-static inline T bswap(T val) {
-  switch (sizeof(T)) {
-  case 2:  return __builtin_bswap16(val);
-  case 4:  return __builtin_bswap32(val);
-  case 8:  return __builtin_bswap64(val);
-  default: __builtin_unreachable();
-  }
-}
-
-template <typename T, int SIZE = sizeof(T)>
-class LittleEndian {
-public:
-  LittleEndian() = default;
-  LittleEndian(T x) { *this = x; }
-
-  operator T() const {
-    if constexpr (sizeof(T) == SIZE) {
-      T x;
-      memcpy(&x, val, sizeof(T));
-      if constexpr (std::endian::native == std::endian::big)
-        x = bswap(x);
-      return x;
-    } else {
-      static_assert(SIZE == 3);
-      return (val[2] << 16) | (val[1] << 8) | val[0];
-    }
-  }
-
-  LittleEndian &operator=(T x) {
-    if constexpr (sizeof(T) == SIZE) {
-      if constexpr (std::endian::native == std::endian::big)
-        x = bswap(x);
-      memcpy(val, &x, sizeof(T));
-    } else {
-      static_assert(SIZE == 3);
-      val[2] = x >> 16;
-      val[1] = x >> 8;
-      val[0] = x;
-    }
-    return *this;
-  }
-
-  LittleEndian &operator++() {
-    return *this = *this + 1;
-  }
-
-  LittleEndian operator++(int) {
-    T ret = *this;
-    *this = *this + 1;
-    return ret;
-  }
-
-  LittleEndian &operator--() {
-    return *this = *this - 1;
-  }
-
-  LittleEndian operator--(int) {
-    T ret = *this;
-    *this = *this - 1;
-    return ret;
-  }
-
-  LittleEndian &operator+=(T x) {
-    return *this = *this + x;
-  }
-
-  LittleEndian &operator-=(T x) {
-    return *this = *this - x;
-  }
-
-  LittleEndian &operator&=(T x) {
-    return *this = *this & x;
-  }
-
-  LittleEndian &operator|=(T x) {
-    return *this = *this | x;
-  }
-
-private:
-  u8 val[SIZE];
-};
-
-using il16 = LittleEndian<i16>;
-using il32 = LittleEndian<i32>;
-using il64 = LittleEndian<i64>;
-using ul16 = LittleEndian<u16>;
-using ul24 = LittleEndian<u32, 3>;
-using ul32 = LittleEndian<u32>;
-using ul64 = LittleEndian<u64>;
-
-template <typename T, int SIZE = sizeof(T)>
-class BigEndian {
-public:
-  BigEndian() = default;
-  BigEndian(T x) { *this = x; }
-
-  operator T() const {
-    if constexpr (sizeof(T) == SIZE) {
-      T x;
-      memcpy(&x, val, sizeof(T));
-      if constexpr (std::endian::native == std::endian::little)
-        x = bswap(x);
-      return x;
-    } else {
-      static_assert(SIZE == 3);
-      return (val[0] << 16) | (val[1] << 8) | val[2];
-    }
-  }
-
-  BigEndian &operator=(T x) {
-    if constexpr (sizeof(T) == SIZE) {
-      if constexpr (std::endian::native == std::endian::little)
-        x = bswap(x);
-      memcpy(val, &x, sizeof(T));
-    } else {
-      static_assert(SIZE == 3);
-      val[0] = x >> 16;
-      val[1] = x >> 8;
-      val[2] = x;
-    }
-    return *this;
-  }
-
-  BigEndian &operator++() {
-    return *this = *this + 1;
-  }
-
-  BigEndian operator++(int) {
-    T ret = *this;
-    *this = *this + 1;
-    return ret;
-  }
-
-  BigEndian &operator--() {
-    return *this = *this - 1;
-  }
-
-  BigEndian operator--(int) {
-    T ret = *this;
-    *this = *this - 1;
-    return ret;
-  }
-
-  BigEndian &operator+=(T x) {
-    return *this = *this + x;
-  }
-
-  BigEndian &operator-=(T x) {
-    return *this = *this - x;
-  }
-
-  BigEndian &operator&=(T x) {
-    return *this = *this & x;
-  }
-
-  BigEndian &operator|=(T x) {
-    return *this = *this | x;
-  }
-
-private:
-  u8 val[SIZE];
-};
-
-using ib16 = BigEndian<i16>;
-using ib32 = BigEndian<i32>;
-using ib64 = BigEndian<i64>;
-using ub16 = BigEndian<u16>;
-using ub24 = BigEndian<u32, 3>;
-using ub32 = BigEndian<u32>;
-using ub64 = BigEndian<u64>;
-
-} // namespace mold
diff --git a/common/jobs-unix.cc b/common/jobs-unix.cc
deleted file mode 100644
index c101388e..00000000
--- a/common/jobs-unix.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-// Many build systems attempt to invoke as many linker processes as there
-// are cores, based on the assumption that the linker is single-threaded.
-// However, since mold is multi-threaded, such build systems' behavior is
-// not beneficial and just increases the overall peak memory usage.
-// On machines with limited memory, this could lead to an out-of-memory
-// error.
-//
-// This file implements a feature that limits the number of concurrent
-// mold processes to just 1 for each user. It is intended to be used as
-// `MOLD_JOBS=1 ninja` or `MOLD_JOBS=1 make -j$(nproc)`.
-//
-// We can't use POSIX semaphores because the counter will not be
-// decremented automatically when a process exits abnormally. That would
-// results in a deadlock. Therefore, we use lockf-based regional file
-// locking instead. Unlike POSIX semaphores, the lock will automatically
-// released on process termination.
-//
-// To wake processes that may be waiting on the lock file, we use a
-// pthread condition variable. On normal exit, mold sends notifications to
-// all waiting processes. In case of abnormal exit, we use
-// pthread_cond_timedwait so that waiters will not wait forever.
-
-#include "common.h"
-
-#include <atomic>
-#include <fcntl.h>
-#include <pthread.h>
-#include <pwd.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-namespace mold {
-
-static constexpr i64 MAX_JOBS = 128;
-
-struct SharedData {
-  std::atomic_bool initialized;
-  pthread_mutex_t mu;
-  pthread_cond_t cond;
-};
-
-static int num_jobs = -1;
-static int lock_fd = -1;
-static SharedData *shared_data = nullptr;
-
-static i64 get_mold_jobs() {
-  char *env = getenv("MOLD_JOBS");
-  if (!env)
-    return 0;
-
-  i64 jobs = std::stol(env);
-  if (jobs < 0)
-    return 0;
-  return std::min(jobs, MAX_JOBS);
-}
-
-static bool do_lock() {
-  for (i64 i = 0; i < num_jobs; i++) {
-    lseek(lock_fd, i, SEEK_SET);
-    if (lockf(lock_fd, F_TLOCK, 1) == 0)
-      return true;
-  }
-  return false;
-}
-
-static SharedData *get_shared_data() {
-  // Create a shared memory object and mmap it
-  std::string name = "/mold-signal-" + std::to_string(getuid());
-  i64 size = sizeof(SharedData);
-
-  int shm_fd = shm_open(name.c_str(), O_CREAT | O_RDWR, 0600);
-  if (shm_fd == -1) {
-    perror("shm_open");
-    exit(1);
-  }
-
-  if (ftruncate(shm_fd, size) == -1) {
-    perror("ftruncate");
-    exit(1);
-  }
-
-  SharedData *data = (SharedData *)mmap(0, size, PROT_READ | PROT_WRITE,
-                                        MAP_SHARED, shm_fd, 0);
-  close(shm_fd);
-
-  if (data->initialized.exchange(true) == false) {
-    pthread_mutexattr_t mu_attr;
-    pthread_mutexattr_init(&mu_attr);
-    pthread_mutexattr_setpshared(&mu_attr, PTHREAD_PROCESS_SHARED);
-
-#ifndef __APPLE__
-    pthread_mutexattr_setrobust(&mu_attr, PTHREAD_MUTEX_ROBUST);
-#endif
-
-    pthread_mutex_init(&data->mu, &mu_attr);
-
-    pthread_condattr_t cond_attr;
-    pthread_condattr_init(&cond_attr);
-    pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
-    pthread_cond_init(&data->cond, &cond_attr);
-  }
-  return data;
-}
-
-void acquire_global_lock() {
-  num_jobs = get_mold_jobs();
-  if (num_jobs == 0)
-    return;
-
-  shared_data = get_shared_data();
-
-  std::string path;
-  if (char *dir = getenv("XDG_RUNTIME_DIR"))
-    path = dir + "/mold.lock"s;
-  else
-    path = "/tmp/mold-" + std::to_string(getuid()) + ".lock";
-
-  lock_fd = open(path.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC, 0600);
-  if (lock_fd == -1 || do_lock())
-    return;
-
-  pthread_mutex_t *mu = &shared_data->mu;
-  pthread_cond_t *cond = &shared_data->cond;
-  int r = pthread_mutex_lock(mu);
-
-#ifndef __APPLE__
-  // If the previous process got killed while holding the mutex, the
-  // mutex has became inconsistent. We need to fix it in that case.
-  if (r == EOWNERDEAD)
-    pthread_mutex_consistent(mu);
-#endif
-
-  for (;;) {
-    struct timespec ts;
-    clock_gettime(CLOCK_REALTIME, &ts);
-    ts.tv_sec += 1;
-
-    int r = pthread_cond_timedwait(cond, mu, &ts);
-    if (do_lock() || r != ETIMEDOUT)
-      break;
-  }
-
-  pthread_mutex_unlock(mu);
-}
-
-void release_global_lock() {
-  if (lock_fd == -1)
-    return;
-  close(lock_fd);
-  pthread_cond_broadcast(&shared_data->cond);
-}
-
-} // namespace mold
diff --git a/common/output-file-unix.h b/common/output-file-unix.h
deleted file mode 100644
index e09a1867..00000000
--- a/common/output-file-unix.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#include "common.h"
-
-#include <fcntl.h>
-#include <filesystem>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-namespace mold {
-
-inline u32 get_umask() {
-  u32 orig_umask = umask(0);
-  umask(orig_umask);
-  return orig_umask;
-}
-
-template <typename Context>
-static std::pair<i64, char *>
-open_or_create_file(Context &ctx, std::string path, i64 filesize, i64 perm) {
-  std::string tmpl = filepath(path).parent_path() / ".mold-XXXXXX";
-  char *path2 = (char *)save_string(ctx, tmpl).data();
-
-  i64 fd = mkstemp(path2);
-  if (fd == -1)
-    Fatal(ctx) << "cannot open " << path2 <<  ": " << errno_string();
-
-  // Reuse an existing file if exists and writable because on Linux,
-  // writing to an existing file is much faster than creating a fresh
-  // file and writing to it.
-  if (ctx.overwrite_output_file && rename(path.c_str(), path2) == 0) {
-    ::close(fd);
-    fd = ::open(path2, O_RDWR | O_CREAT, perm);
-    if (fd != -1 && !ftruncate(fd, filesize) && !fchmod(fd, perm & ~get_umask()))
-      return {fd, path2};
-
-    unlink(path2);
-    fd = ::open(path2, O_RDWR | O_CREAT, perm);
-    if (fd == -1)
-      Fatal(ctx) << "cannot open " << path2 << ": " << errno_string();
-  }
-
-  if (fchmod(fd, (perm & ~get_umask())) == -1)
-    Fatal(ctx) << "fchmod failed: " << errno_string();
-
-#ifdef __linux__
-  if (fallocate(fd, 0, 0, filesize) == 0)
-    return {fd, path2};
-#endif
-
-  if (ftruncate(fd, filesize) == -1)
-    Fatal(ctx) << "ftruncate failed: " << errno_string();
-  return {fd, path2};
-}
-
-template <typename Context>
-class MemoryMappedOutputFile : public OutputFile<Context> {
-public:
-  MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm)
-    : OutputFile<Context>(path, filesize, true) {
-    std::tie(this->fd, output_tmpfile) =
-      open_or_create_file(ctx, path, filesize, perm);
-
-    this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE,
-                           MAP_SHARED, this->fd, 0);
-    if (this->buf == MAP_FAILED)
-      Fatal(ctx) << path << ": mmap failed: " << errno_string();
-
-    mold::output_buffer_start = this->buf;
-    mold::output_buffer_end = this->buf + filesize;
-  }
-
-  ~MemoryMappedOutputFile() {
-    if (fd2 != -1)
-      ::close(fd2);
-  }
-
-  void close(Context &ctx) override {
-    Timer t(ctx, "close_file");
-
-    if (!this->is_unmapped)
-      munmap(this->buf, this->filesize);
-
-    if (this->buf2.empty()) {
-      ::close(this->fd);
-    } else {
-      FILE *out = fdopen(this->fd, "w");
-      fseek(out, 0, SEEK_END);
-      fwrite(&this->buf2[0], this->buf2.size(), 1, out);
-      fclose(out);
-    }
-
-    // If an output file already exists, open a file and then remove it.
-    // This is the fastest way to unlink a file, as it does not make the
-    // system to immediately release disk blocks occupied by the file.
-    fd2 = ::open(this->path.c_str(), O_RDONLY);
-    if (fd2 != -1)
-      unlink(this->path.c_str());
-
-    if (rename(output_tmpfile, this->path.c_str()) == -1)
-      Fatal(ctx) << this->path << ": rename failed: " << errno_string();
-    output_tmpfile = nullptr;
-  }
-
-private:
-  int fd2 = -1;
-};
-
-template <typename Context>
-std::unique_ptr<OutputFile<Context>>
-OutputFile<Context>::open(Context &ctx, std::string path, i64 filesize, i64 perm) {
-  Timer t(ctx, "open_file");
-
-  if (path.starts_with('/') && !ctx.arg.chroot.empty())
-    path = ctx.arg.chroot + "/" + path_clean(path);
-
-  bool is_special = false;
-  if (path == "-") {
-    is_special = true;
-  } else {
-    struct stat st;
-    if (stat(path.c_str(), &st) == 0 && (st.st_mode & S_IFMT) != S_IFREG)
-      is_special = true;
-  }
-
-  OutputFile<Context> *file;
-  if (is_special)
-    file = new MallocOutputFile(ctx, path, filesize, perm);
-  else
-    file = new MemoryMappedOutputFile(ctx, path, filesize, perm);
-
-#ifdef MADV_HUGEPAGE
-  // Enable transparent huge page for an output memory-mapped file.
-  // On Linux, it has an effect only on tmpfs mounted with `huge=advise`,
-  // but it can make the linker ~10% faster. You can try it by creating
-  // a tmpfs with the following commands
-  //
-  //  $ mkdir tmp
-  //  $ sudo mount -t tmpfs -o size=2G,huge=advise none tmp
-  //
-  // and then specifying a path under the directory as an output file.
-  madvise(file->buf, filesize, MADV_HUGEPAGE);
-#endif
-
-  if (ctx.arg.filler != -1)
-    memset(file->buf, ctx.arg.filler, filesize);
-  return std::unique_ptr<OutputFile>(file);
-}
-
-} // namespace mold
diff --git a/common/output-file-win32.h b/common/output-file-win32.h
deleted file mode 100644
index 5fc92496..00000000
--- a/common/output-file-win32.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#include "common.h"
-
-#include <fcntl.h>
-#include <filesystem>
-#include <windows.h>
-
-namespace mold {
-
-template <typename Context>
-class MemoryMappedOutputFile : public OutputFile<Context> {
-public:
-  MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm)
-      : OutputFile<Context>(path, filesize, true) {
-    // TODO: use intermediate temporary file for output.
-    DWORD file_attrs =
-        (perm & 0200) ? FILE_ATTRIBUTE_NORMAL : FILE_ATTRIBUTE_READONLY;
-    file_handle =
-        CreateFileA(path.c_str(), GENERIC_READ | GENERIC_WRITE,
-                    FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
-                    nullptr, CREATE_ALWAYS, file_attrs, nullptr);
-    if (file_handle == INVALID_HANDLE_VALUE)
-      Fatal(ctx) << "cannot open " << path << ": " << GetLastError();
-
-    HANDLE mapping_handle = CreateFileMapping(
-        file_handle, nullptr, PAGE_READWRITE, 0, filesize, nullptr);
-    if (!mapping_handle)
-      Fatal(ctx) << path << ": CreateFileMapping failed: " << GetLastError();
-
-    this->buf =
-        (u8 *)MapViewOfFile(mapping_handle, FILE_MAP_WRITE, 0, 0, filesize);
-    CloseHandle(mapping_handle);
-    if (!this->buf)
-      Fatal(ctx) << path << ": MapViewOfFile failed: " << GetLastError();
-
-    mold::output_buffer_start = this->buf;
-    mold::output_buffer_end = this->buf + filesize;
-  }
-
-  ~MemoryMappedOutputFile() {
-    if (file_handle != INVALID_HANDLE_VALUE)
-      CloseHandle(file_handle);
-  }
-
-  void close(Context &ctx) override {
-    Timer t(ctx, "close_file");
-
-    UnmapViewOfFile(this->buf);
-
-    if (!this->buf2.empty()) {
-      if (SetFilePointer(file_handle, 0, nullptr, FILE_END) ==
-          INVALID_SET_FILE_POINTER)
-        Fatal(ctx) << this->path
-                   << ": SetFilePointer failed: " << GetLastError();
-
-      DWORD written;
-      if (!WriteFile(file_handle, this->buf2.data(), this->buf2.size(),
-                     &written, nullptr))
-        Fatal(ctx) << this->path << ": WriteFile failed: " << GetLastError();
-    }
-
-    CloseHandle(file_handle);
-    file_handle = INVALID_HANDLE_VALUE;
-  }
-
-private:
-  HANDLE file_handle;
-};
-
-template <typename Context>
-std::unique_ptr<OutputFile<Context>>
-OutputFile<Context>::open(Context &ctx, std::string path, i64 filesize, i64 perm) {
-  Timer t(ctx, "open_file");
-
-  if (path.starts_with('/') && !ctx.arg.chroot.empty())
-    path = ctx.arg.chroot + "/" + path_clean(path);
-
-  bool is_special = false;
-  if (path == "-") {
-    is_special = true;
-  } else {
-    HANDLE file_handle =
-        CreateFileA(path.c_str(), GENERIC_READ,
-                    FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
-                    nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
-    if (file_handle != INVALID_HANDLE_VALUE) {
-      if (GetFileType(file_handle) != FILE_TYPE_DISK)
-        is_special = true;
-      CloseHandle(file_handle);
-    }
-  }
-
-  OutputFile<Context> *file;
-  if (is_special)
-    file = new MallocOutputFile(ctx, path, filesize, perm);
-  else
-    file = new MemoryMappedOutputFile(ctx, path, filesize, perm);
-
-  if (ctx.arg.filler != -1)
-    memset(file->buf, ctx.arg.filler, filesize);
-  return std::unique_ptr<OutputFile<Context>>(file);
-}
-
-} // namespace mold
diff --git a/common/output-file.h b/common/output-file.h
deleted file mode 100644
index 63299ed9..00000000
--- a/common/output-file.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#if _WIN32
-# include "output-file-win32.h"
-#else
-# include "output-file-unix.h"
-#endif
diff --git a/debian/changelog b/debian/changelog
index cedfce47..7e7ae693 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,13 @@
-mold (2.32.1+dfsg-3) UNRELEASED; urgency=medium
+mold (2.34.0+dfsg-1) unstable; urgency=medium
 
-  * Add --encoded-package-metadata option (pulled from upstream, #1308).
+  * New upstream release
+  * Fix two missing-license-paragraph-in-dep5-copyright warnings
+
+  [ Matthias Klose ]
   * When no package-metadata option is given, fall-back to the
     envvar ELF_PACKAGE_METADATA.
 
- -- Matthias Klose <doko@debian.org>  Tue, 06 Aug 2024 13:29:29 +0200
+ -- Sylvestre Ledru <sylvestre@debian.org>  Wed, 25 Sep 2024 12:30:51 +0200
 
 mold (2.32.1+dfsg-2) unstable; urgency=medium
 
diff --git a/debian/copyright b/debian/copyright
index 54ec87d2..3c0b7c1e 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -9,7 +9,7 @@ Files-Excluded: third-party/mimalloc/bin/mimalloc-redirect.dll
                 third-party/zlib/contrib/dotzlib
 
 Files: *
-Copyright: 2020-2021 Rui Ueyama <ruiu@cs.stanford.edu>
+Copyright: 2020-2024 Rui Ueyama <ruiu@cs.stanford.edu>
 License: MIT
 
 Files: third-party/tbb/*
@@ -111,3 +111,33 @@ License: MIT
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
+
+License: BSD-3-Clause
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ .
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ .
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ .
+ 3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software without
+ specific prior written permission.
+ .
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+License: GPL-2+
+ On Debian systems, the full text of the GNU General Public License
+ version 2 can be found in the file '/usr/share/common-licenses/GPL-2'.
diff --git a/debian/patches/encoded-package-metadata.diff b/debian/patches/encoded-package-metadata.diff
deleted file mode 100644
index 7a7a8ed6..00000000
--- a/debian/patches/encoded-package-metadata.diff
+++ /dev/null
@@ -1,79 +0,0 @@
---- a/elf/cmdline.cc
-+++ b/elf/cmdline.cc
-@@ -119,6 +119,8 @@ Options:
-   --oformat=binary            Omit ELF, section, and program headers
-   --pack-dyn-relocs=[relr,none]
-                               Pack dynamic relocations
-+  --encoded-package-metadata=PERCENT_ENCODED_STRING
-+                              Set a given string to .note.package
-   --package-metadata=STRING   Set a given string to .note.package
-   --perf                      Print performance statistics
-   --pie, --pic-executable     Create a position-independent executable
-@@ -406,6 +408,49 @@ split_by_comma_or_colon(std::string_view
-   return vec;
- }
- 
-+/* Decode a hexadecimal character. Return -1 on error. */
-+static int hexdecode(char c) {
-+  if ('0' <= c && c <= '9')
-+    return c - '0';
-+  if ('A' <= c && c <= 'F')
-+    return c - 'A' + 10;
-+  if ('a' <= c && c <= 'f')
-+    return c - 'a' + 10;
-+  return -1;
-+}
-+
-+template <typename E>
-+static std::string parse_percent_encoded_string(Context<E> &ctx, std::string opt, std::string_view arg) {
-+  std::string decoded;
-+  int step = 1;
-+  for (i64 i = 0; i < arg.size(); i += step) {
-+    step = 1;
-+    if (arg[i] != '%') {
-+      decoded += arg[i];
-+      continue;
-+    }
-+    if (i + 1 > arg.size()) {
-+      Fatal(ctx) << "option --" << opt << ": invalid percent-encoded string: " << arg;
-+    }
-+    step++;
-+    if (arg[i+1] == '%') {
-+      decoded += '%';
-+      continue;
-+    }
-+    if (i + 2 > arg.size()) {
-+      Fatal(ctx) << "option --" << opt << ": invalid percent-encoded string: " << arg;
-+    }
-+    step++;
-+    int hex1 = hexdecode(arg[i+1]);
-+    int hex2 = hexdecode(arg[i+2]);
-+    if (hex1 == -1 || hex2 == -1) {
-+      Fatal(ctx) << "option --" << opt << ": invalid percent-encoded string: " << arg;
-+    }
-+    decoded += (char) ((hex1 << 4) + hex2);
-+  }
-+  return decoded;
-+}
-+
- template <typename E>
- static void read_retain_symbols_file(Context<E> &ctx, std::string_view path) {
-   MappedFile *mf = must_open_file(ctx, std::string(path));
-@@ -863,6 +908,8 @@ std::vector<std::string> parse_nonpositi
-     } else if (read_flag("pack-dyn-relocs=none") ||
-                read_z_flag("nopack-relative-relocs")) {
-       ctx.arg.pack_dyn_relocs_relr = false;
-+    } else if (read_arg("encoded-package-metadata")) {
-+      ctx.arg.package_metadata = parse_percent_encoded_string(ctx, "encoded-package-metadata", arg);
-     } else if (read_arg("package-metadata")) {
-       ctx.arg.package_metadata = arg;
-     } else if (read_flag("stats")) {
---- a/test/elf/package-metadata.sh
-+++ b/test/elf/package-metadata.sh
-@@ -10,3 +10,6 @@ EOF
- 
- $CC -B. -o $t/exe $t/a.o -Wl,-package-metadata='{"foo":"bar"}'
- readelf -x .note.package $t/exe | grep -Fq '{"foo":"bar"}'
-+
-+$CC -B. -o $t/exe2 $t/a.o -Wl,--encoded-package-metadata=%7B%22foo%22%3A%22bar%22%7D
-+readelf -x .note.package $t/exe2 | grep -Fq '{"foo":"bar"}'
diff --git a/debian/patches/env-package-metadata.diff b/debian/patches/env-package-metadata.diff
index d88f614a..d1b1b590 100644
--- a/debian/patches/env-package-metadata.diff
+++ b/debian/patches/env-package-metadata.diff
@@ -1,6 +1,8 @@
---- a/elf/cmdline.cc
-+++ b/elf/cmdline.cc
-@@ -1480,6 +1480,14 @@ std::vector<std::string> parse_nonpositi
+Index: mold/src/cmdline.cc
+===================================================================
+--- mold.orig/src/cmdline.cc
++++ mold/src/cmdline.cc
+@@ -1506,6 +1506,14 @@ std::vector<std::string> parse_nonpositi
        ctx.arg.dependency_file = ctx.arg.chroot + "/" + ctx.arg.dependency_file;
    }
  
diff --git a/debian/patches/fix-armhf-build.diff b/debian/patches/fix-armhf-build.diff
deleted file mode 100644
index 5462028d..00000000
--- a/debian/patches/fix-armhf-build.diff
+++ /dev/null
@@ -1,24 +0,0 @@
-From baf9ae9038dba56324e08e5df0023225a6067154 Mon Sep 17 00:00:00 2001
-From: Rui Ueyama <ruiu@cs.stanford.edu>
-Date: Tue, 16 Jul 2024 11:59:22 +0900
-Subject: [PATCH] Fix a test on Debian
-
-If the default linker doesn't complain, just skip the test.
-
-Fixes https://github.com/rui314/mold/issues/1301
----
- test/elf/arm_abs-error.sh | 2 ++
- 1 file changed, 2 insertions(+)
-
-Index: mold/test/elf/arm_abs-error.sh
-===================================================================
---- mold.orig/test/elf/arm_abs-error.sh
-+++ mold/test/elf/arm_abs-error.sh
-@@ -12,5 +12,7 @@ extern char foo;
- int main() { printf("foo=%p\n", &foo); }
- EOF
- 
-+$CC -o $t/exe -pie $t/a.o $t/b.o >& /dev/null && skip
-+
- ! $CC -B. -o $t/exe -pie $t/a.o $t/b.o >& $t/log
- grep -q 'recompile with -fPIC' $t/log
diff --git a/debian/patches/series b/debian/patches/series
index c5049b32..3b68d71e 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,3 +1 @@
-fix-armhf-build.diff
-encoded-package-metadata.diff
 env-package-metadata.diff
diff --git a/dist.sh b/dist.sh
index 6d2d698a..1137feaa 100755
--- a/dist.sh
+++ b/dist.sh
@@ -162,6 +162,8 @@ mkdir /build
 cd /build
 cmake -DCMAKE_BUILD_TYPE=Release -DMOLD_MOSTLY_STATIC=On /mold
 cmake --build . -j\$(nproc)
+mv mold mold2
+./mold2 -run cmake --build . -j\$(nproc)
 ctest -j\$(nproc)
 cmake --install . --prefix $dest --strip
 find $dest -print | xargs touch --no-dereference --date='$timestamp'
diff --git a/docs/design.md b/docs/design.md
index 62ea14e6..6bcf7004 100644
--- a/docs/design.md
+++ b/docs/design.md
@@ -1,3 +1,8 @@
+[This document was written in 2020, and the contents are outdated.
+Specifically, we no longer believe that object preloading is a good
+idea. That being said, most of the points in this document still hold
+even today. Therefore, I'll keep this document as-is.]
+
 ## Design and implementation of mold
 
 For the rest of this documentation, I'll explain the design and the
diff --git a/docs/mold.1 b/docs/mold.1
index 283a89a4..f0d9d1f7 100644
--- a/docs/mold.1
+++ b/docs/mold.1
@@ -1,6 +1,6 @@
 .\" generated with Ronn-NG/v0.9.1
 .\" http://github.com/apjanke/ronn-ng/tree/0.9.1
-.TH "MOLD" "1" "May 2024" ""
+.TH "MOLD" "1" "August 2024" ""
 .SH "NAME"
 \fBmold\fR \- a modern linker
 .SH "SYNOPSIS"
@@ -67,6 +67,9 @@ Synonym for \fB\-\-color\-diagnostics=auto\fR\.
 \fB\-\-no\-color\-diagnostics\fR
 Synonym for \fB\-\-color\-diagnostics=never\fR\.
 .TP
+\fB\-\-detach\fR, \fB\-\-no\-detach\fR
+Permit or do not permit mold to create a debug info file in the background\.
+.TP
 \fB\-\-fork\fR, \fB\-\-no\-fork\fR
 Spawn a child process and let it do the actual linking\. When linking a large program, the OS kernel can take a few hundred milliseconds to terminate a \fBmold\fR process\. \fB\-\-fork\fR hides that latency\. By default, it does fork\.
 .TP
@@ -94,7 +97,16 @@ This option is useful for finding bugs that depend on the initialization order o
 By reversing the order of input sections using \fB\-\-reverse\-sections\fR, you can easily test that your program works in the reversed initialization order\.
 .TP
 \fB\-\-run\fR \fIcommand\fR \fIarg\fR\|\.\|\.\|\.
-Run \fIcommand\fR with \fBmold\fR \fB/usr/bin/ld\fR\. Specifically, \fBmold\fR runs a given command with the \fBLD_PRELOAD\fR environment set to intercept exec(3) family functions and replaces \fBargv[0]\fR with itself if it is \fBld\fR, \fBld\.gold\fR, or \fBld\.lld\fR\.
+Run \fIcommand\fR with \fBmold\fR as \fB/usr/bin/ld\fR\. Specifically, \fBmold\fR runs a given command with the \fBLD_PRELOAD\fR environment set to intercept exec(3) family functions and replaces \fBargv[0]\fR with itself if it is \fBld\fR, \fBld\.gold\fR, or \fBld\.lld\fR\.
+.TP
+\fB\-\-separate\-debug\-file\fR, \fB\-\-separate\-debug\-file\fR=\fIfile\fR
+Bundle debug info sections into a separate file instead of embedding them in an output executable or a shared library\. mold creates a debug info file in the background by default, so that you can start running your executable as soon as possible\.
+.IP
+By default, the debug info file is created in the same directory as is the output file, with the \fB\.dbg\fR file extension\. That filename is embedded into the output file so that \fBgdb\fR can automatically find the debug info file for the output file\. For more info about gdb features related to separate debug files, see \fIhttps://sourceware\.org/gdb/current/onlinedocs/gdb\.html/Separate\-Debug\-Files\.html\fR\.
+.IP
+mold holds a file lock with flock(2) while creating a debug info file in the background\.
+.IP
+If you don't want to create a debug info file in the background, pass the \fB\-\-no\-detach\fR option\.
 .TP
 \fB\-\-shuffle\-sections\fR, \fB\-\-shuffle\-sections\fR=\fInumber\fR
 Randomize the output by shuffling the order of input sections before assigning them the offsets in the output file\. If a \fInumber\fR is given, it's used as a seed for the random number generator, so that the linker produces the same output for the same seed\. If no seed is given, a random number is used as a seed\.
@@ -119,6 +131,17 @@ Use multiple threads\. By default, \fBmold\fR uses as many threads as the number
 .TP
 \fB\-\-quick\-exit\fR, \fB\-\-no\-quick\-exit\fR
 Use or do not use \fBquick_exit\fR to exit\.
+.TP
+\fB\-z rewrite\-endbr\fR, \fB\-z norewrite\-endbr\fR
+As a security measure, some CPU instruction sets have recently gained a feature to protect control flow integrity by disallowing indirect branches by default\. If the feature is enabled, the instruction that is executed immediately after an indirect branch must be a branch target marker instruction, or a CPU\-level fault is raised\. The marker instruction is also known as a "landing pad" instruction, to which indirect branches can land\. This feature makes ROP attacks harder to conduct\.
+.IP
+To use the feature, a function whose pointer is taken needs to begin with a landing pad because a function call via a function pointer is compiled to an indirect branch\. On the other hand, if a function is called only directly (i\.e\. referred to only by \fIdirect\fR branch instructions), it doesn't have to begin with it\.
+.IP
+By default, the compiler always emits a landing pad at the beginning of each global function because it doesn't know whether or not the function's pointer is taken in another translation unit\. As a result, the resulting binary has more attack surface than necessary\.
+.IP
+If \fB\-z rewrite\-endbr\fR is given, mold conducts a whole program analysis to identify functions whose addresses are actually taken and rewrites landing pads with no\-ops for non\-address\-taken functions, reducing the attack surface\.
+.IP
+This feature is currently available only on x86\-64\.
 .SH "GNU\-COMPATIBLE OPTIONS"
 .TP
 \fB\-\-help\fR
@@ -227,13 +250,20 @@ Alias for \fB\-\-section\-start=\.text=\fR\fIaddress\fR\.
 \fB\-\-allow\-multiple\-definition\fR
 Normally, the linker reports an error if there are more than one definition of a symbol\. This option changes the default behavior so that it doesn't report an error for duplicate definitions and instead use the first definition\.
 .TP
+\fB\-\-allow\-shlib\-undefined\fR, \fB\-\-no\-allow\-shlib\-undefined\fR
+Even if mold succeeds in linking a main executable without undefined symbol errors, you may still encounter symbol lookup errors at runtime because the dynamic linker cannot find some symbols in shared libraries in any ELF module\. This occurs because mold ignores undefined symbols in shared libraries by default\.
+.IP
+If you pass \fB\-\-no\-allow\-shlib\-undefined\fR, mold verifies that undefined symbols in shared libraries given to the linker can be resolved at link\-time\. In other words, this converts the runtime error to a link\-time error\.
+.IP
+Note that you need to pass all shared libraries, including indirectly dependent ones, to the linker as arguments for \fB\-l\fR\. If a shared library depends on a library that's not passed to the linker, the verification will be skipped for that file\.
+.TP
 \fB\-\-as\-needed\fR, \fB\-\-no\-as\-needed\fR
 By default, shared libraries given to the linker are unconditionally added to the list of required libraries in an output file\. However, shared libraries after \fB\-\-as\-needed\fR are added to the list only when at least one symbol is actually used by the output file\. In other words, shared libraries after \fB\-\-as\-needed\fR are not added to the list of needed libraries if they are not needed by a program\.
 .IP
 The \fB\-\-no\-as\-needed\fR option restores the default behavior for subsequent files\.
 .TP
-\fB\-\-build\-id\fR=[ \fBmd5\fR | \fBsha1\fR | \fBsha256\fR | \fBuuid\fR | \fB0x\fR\fIhexstring\fR | \fBnone\fR ]
-Create a \fB\.note\.gnu\.build\-id\fR section containing a byte string to uniquely identify an output file\. \fBsha256\fR compute a 256\-bit cryptographic hash of an output file and set it to build\-id\. \fBmd5\fR and \fBsha1\fR compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it to build\-id\. \fBuuid\fR sets a random 128\-bit UUID\. \fB0x\fR\fIhexstring\fR sets \fIhexstring\fR\.
+\fB\-\-build\-id\fR=[ \fBmd5\fR | \fBsha1\fR | \fBsha256\fR | \fBfast\fR | \fBuuid\fR | \fB0x\fR\fIhexstring\fR | \fBnone\fR ]
+Create a \fB\.note\.gnu\.build\-id\fR section containing a byte string to uniquely identify an output file\. \fBsha256\fR compute a 256\-bit cryptographic hash of an output file and set it to build\-id\. \fBmd5\fR and \fBsha1\fR compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it to build\-id\. \fBuuid\fR sets a random 128\-bit UUID\. \fB0x\fR\fIhexstring\fR sets \fIhexstring\fR\. \fBfast\fR is a synonym for \fBsha256\fR\.
 .TP
 \fB\-\-build\-id\fR
 Synonym for \fB\-\-build\-id=sha256\fR\.
@@ -272,10 +302,8 @@ The \fB\-\-emit\-relocs\fR instructs the linker to leave relocation sections in
 \fB\-\-enable\-new\-dtags\fR, \fB\-\-disable\-new\-dtags\fR
 By default, \fBmold\fR emits \fBDT_RUNPATH\fR for \fB\-\-rpath\fR\. If you pass \fB\-\-disable\-new\-dtags\fR, \fBmold\fR emits \fBDT_RPATH\fR for \fB\-\-rpath\fR instead\.
 .TP
-\fB\-\-execute\-only\fR
-Traditionally, most processors require both executable and readable bits to 1 to make the page executable, which allows machine code to be read as data at runtime\. This is actually what an attacker often does after gaining a limited control of a process to find pieces of machine code they can use to gain the full control of the process\. As a mitigation, some recent processors allows "execute\-only" pages\. If a page is execute\-only, you can call a function there as long as you know its address but can't read it as data\.
-.IP
-This option marks text segments execute\-only\. This option currently works only on some ARM64 processors\.
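+\fB\-\-execute\-only\fR
+Traditionally, setting the executable bit to 1 for a memory page implies that the page also becomes readable, which allows machine code to be read as data at runtime\. That is actually what an attacker often does after gaining a limited control of a process to find pieces of machine code they can use to gain the full control of the process\. As a mitigation, recent processors including some ARM64 ones allow "execute\-only" pages\. If a page is execute\-only, you can call a function there as long as you know its address but can't read it as data\. This option marks text segments as execute\-only by setting just the "X" bit instead of "RX"\. Note that on most systems, the absence of the "R" bit in the text segment serves just as a hint\. If you run a program linked with \fB\-\-execute\-only\fR on a processor that doesn't support execute\-only pages, your executable will likely still function normally, but the text segment will remain readable\.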
 .TP
 \fB\-\-exclude\-libs\fR=\fIlibraries\fR \|\.\|\.\|\.
 Mark all symbols in the given \fIlibraries\fR hidden\.
@@ -332,9 +360,6 @@ If \fBrelr\fR is specified, all \fBR_*_RELATIVE\fR relocations are put into \fB\
 .IP
 Note that a runtime loader has to support \fB\.relr\.dyn\fR to run executables or shared libraries linked with \fB\-\-pack\-dyn\-relocs=relr\fR\. As of 2022, only ChromeOS, Android and Fuchsia support it\.
 .TP
-\fB\-\-package\-metadata\fR=\fIstring\fR
-Embed \fIstring\fR to a \fB\.note\.package\fR section\. This option is intended to be used by a package management command such as rpm(8) to embed metadata regarding a package to each executable file\.
-.TP
 \fB\-\-pie\fR, \fB\-\-pic\-executable\fR, \fB\-\-no\-pie\fR, \fB\-\-no\-pic\-executable\fR
 Create a position\-independent executable\.
 .TP
@@ -499,7 +524,7 @@ Mark DSO to be initialized first at runtime\.
 \fB\-z interpose\fR
 Mark object to interpose all DSOs but executable\.
 .TP
-\fB\-(\fR, \fB\-)\fR, \fB\-EL\fR, \fB\-O\fR\fInumber\fR, \fB\-\-allow\-shlib\-undefined\fR, \fB\-\-dc\fR, \fB\-\-dp\fR, \fB\-\-end\-group\fR, \fB\-\-no\-add\-needed\fR, \fB\-\-no\-allow\-shlib\-undefined\fR, \fB\-\-no\-copy\-dt\-needed\-entries\fR, \fB\-\-nostdlib\fR, \fB\-\-rpath\-link=Ar dir\fR, \fB\-\-sort\-common\fR, \fB\-\-sort\-section\fR, \fB\-\-start\-group\fR, \fB\-\-warn\-constructors\fR, \fB\-\-warn\-once\fR, \fB\-\-fix\-cortex\-a53\-835769\fR, \fB\-\-fix\-cortex\-a53\-843419\fR, \fB\-z combreloc\fR, \fB\-z common\-page\-size\fR, \fB\-z nocombreloc\fR
+\fB\-(\fR, \fB\-)\fR, \fB\-EL\fR, \fB\-O\fR\fInumber\fR, \fB\-\-dc\fR, \fB\-\-dp\fR, \fB\-\-end\-group\fR, \fB\-\-no\-add\-needed\fR, \fB\-\-no\-copy\-dt\-needed\-entries\fR, \fB\-\-nostdlib\fR, \fB\-\-rpath\-link=Ar dir\fR, \fB\-\-sort\-common\fR, \fB\-\-sort\-section\fR, \fB\-\-start\-group\fR, \fB\-\-warn\-constructors\fR, \fB\-\-warn\-once\fR, \fB\-\-fix\-cortex\-a53\-835769\fR, \fB\-\-fix\-cortex\-a53\-843419\fR, \fB\-z combreloc\fR, \fB\-z common\-page\-size\fR, \fB\-z nocombreloc\fR
 Ignored
 .SH "ENVIRONMENT VARIABLES"
 .TP
diff --git a/docs/mold.md b/docs/mold.md
index 7ba64401..19e7c25b 100644
--- a/docs/mold.md
+++ b/docs/mold.md
@@ -152,6 +152,9 @@ but as `-o magic`.
 * `--no-color-diagnostics`:
   Synonym for `--color-diagnostics=never`.
 
+* `--detach`, `--no-detach`:
+  Permit or do not permit mold to create a debug info file in the background.
+
 * `--fork`, `--no-fork`:
   Spawn a child process and let it do the actual linking. When linking a large
   program, the OS kernel can take a few hundred milliseconds to terminate a
@@ -198,10 +201,29 @@ but as `-o magic`.
   easily test that your program works in the reversed initialization order.
 
 * `--run` _command_ _arg_...:
-  Run _command_ with `mold` `/usr/bin/ld`. Specifically, `mold` runs a given
-  command with the `LD_PRELOAD` environment set to intercept exec(3) family
-  functions and replaces `argv[0]` with itself if it is `ld`, `ld.gold`, or
-  `ld.lld`.
+  Run _command_ with `mold` as `/usr/bin/ld`. Specifically, `mold` runs a
+  given command with the `LD_PRELOAD` environment set to intercept exec(3)
+  family functions and replaces `argv[0]` with itself if it is `ld`,
+  `ld.gold`, or `ld.lld`.
+
+* `--separate-debug-file`, `--separate-debug-file`=_file_:
+  Bundle debug info sections into a separate file instead of embedding them in
+  an output executable or a shared library. mold creates a debug info file in
+  the background by default, so that you can start running your executable as
+  soon as possible.
+
+  By default, the debug info file is created in the same directory as is the
+  output file, with the `.dbg` file extension. That filename is embedded into
+  the output file so that `gdb` can automatically find the debug info file for
+  the output file. For more info about gdb features related to separate debug
+  files, see
+  <https://sourceware.org/gdb/current/onlinedocs/gdb.html/Separate-Debug-Files.html>.
+
+  mold holds a file lock with flock(2) while creating a debug info file in the
+  background.
+
+  If you don't want to create a debug info file in the background, pass the
+  `--no-detach` option.
 
 * `--shuffle-sections`, `--shuffle-sections`=_number_:
   Randomize the output by shuffling the order of input sections before
@@ -246,6 +268,33 @@ but as `-o magic`.
 * `--quick-exit`, `--no-quick-exit`:
   Use or do not use `quick_exit` to exit.
 
+* `-z rewrite-endbr`, `-z norewrite-endbr`:
+  As a security measure, some CPU instruction sets have recently gained a
+  feature to protect control flow integrity by disallowing indirect branches
+  by default. If the feature is enabled, the instruction that is executed
+  immediately after an indirect branch must be a branch target marker
+  instruction, or a CPU-level fault is raised. The marker instruction is also
+  known as a "landing pad" instruction, to which indirect branches can land.
+  This feature makes ROP attacks harder to conduct.
+
+  To use the feature, a function whose pointer is taken needs to begin with a
+  landing pad because a function call via a function pointer is compiled to an
+  indirect branch. On the other hand, if a function is called only directly
+  (i.e. referred to only by _direct_ branch instructions), it doesn't have to
+  begin with it.
+
+  By default, the compiler always emits a landing pad at the beginning of each
+  global function because it doesn't know whether or not the function's
+  pointer is taken in another translation unit. As a result, the resulting
+  binary has more attack surface than necessary.
+
+  If `-z rewrite-endbr` is given, mold conducts a whole program analysis
+  to identify functions whose addresses are actually taken and rewrites
+  landing pads with no-ops for non-address-taken functions, reducing the
+  attack surface.
+
+  This feature is currently available only on x86-64.
+
 ## GNU-COMPATIBLE OPTIONS
 
 * `--help`:
@@ -390,6 +439,23 @@ but as `-o magic`.
   report an error for duplicate definitions and instead use the first
   definition.
 
+* `--allow-shlib-undefined`, `--no-allow-shlib-undefined`:
+  Even if mold succeeds in linking a main executable without undefined symbol
+  errors, you may still encounter symbol lookup errors at runtime because the
+  dynamic linker cannot find some symbols in shared libraries in any ELF
+  module. This occurs because mold ignores undefined symbols in shared
+  libraries by default.
+
+  If you pass `--no-allow-shlib-undefined`, mold verifies that undefined
+  symbols in shared libraries given to the linker can be resolved at
+  link-time. In other words, this converts the runtime error to a link-time
+  error.
+
+  Note that you need to pass all shared libraries, including indirectly
+  dependent ones, to the linker as arguments for `-l`. If a shared library
+  depends on a library that's not passed to the linker, the verification will
+  be skipped for that file.
+
 * `--as-needed`, `--no-as-needed`:
   By default, shared libraries given to the linker are unconditionally added
   to the list of required libraries in an output file. However, shared
@@ -401,13 +467,13 @@ but as `-o magic`.
   The `--no-as-needed` option restores the default behavior for subsequent
   files.
 
-* `--build-id`=[ `md5` | `sha1` | `sha256` | `uuid` | `0x`_hexstring_ | `none` ]:
+* `--build-id`=[ `md5` | `sha1` | `sha256` | `fast` | `uuid` | `0x`_hexstring_ | `none` ]:
   Create a `.note.gnu.build-id` section containing a byte string to uniquely
   identify an output file. `sha256` compute a 256-bit cryptographic hash of an
   output file and set it to build-id. `md5` and `sha1` compute the same hash
   but truncate it to 128 and 160 bits, respectively, before setting it to
   build-id. `uuid` sets a random 128-bit UUID. `0x`_hexstring_ sets
-  _hexstring_.
+  _hexstring_. `fast` is a synonym for `sha256`.
 
 * `--build-id`:
   Synonym for `--build-id=sha256`.
@@ -463,17 +529,22 @@ but as `-o magic`.
   `--disable-new-dtags`, `mold` emits `DT_RPATH` for `--rpath` instead.
 
 * `--execute-only`:
-  Traditionally, most processors require both executable and readable bits to
-  1 to make the page executable, which allows machine code to be read as data
-  at runtime. This is actually what an attacker often does after gaining a
-  limited control of a process to find pieces of machine code they can use to
-  gain the full control of the process. As a mitigation, some recent
-  processors allows "execute-only" pages. If a page is execute-only, you can
-  call a function there as long as you know its address but can't read it as
-  data.
-
-  This option marks text segments execute-only. This option currently works
-  only on some ARM64 processors.
+
+  Traditionally, setting the executable bit to 1 for a memory page implies
+  that the page also becomes readable, which allows machine code to be read
+  as data at runtime. That is actually what an attacker often does after
+  gaining a limited control of a process to find pieces of machine code
+  they can use to gain the full control of the process. As a mitigation,
+  recent processors including some ARM64 ones allow "execute-only" pages.
+  If a page is execute-only, you can call a function there as long as you
+  know its address but can't read it as data.
+
+  This option marks text segments as execute-only by setting just the "X"
+  bit instead of "RX". Note that on most systems, the absence of the "R"
+  bit in the text segment serves just as a hint. If you run a program
+  linked with `--execute-only` on a processor that doesn't support
+  execute-only pages, your executable will likely still function normally,
+  but the text segment will remain readable.
 
 * `--exclude-libs`=_libraries_ ...:
   Mark all symbols in the given _libraries_ hidden.
@@ -558,11 +629,6 @@ but as `-o magic`.
   shared libraries linked with `--pack-dyn-relocs=relr`. As of 2022, only
   ChromeOS, Android and Fuchsia support it.
 
-* `--package-metadata`=_string_:
-  Embed _string_ to a `.note.package` section. This option is intended to be
-  used by a package management command such as rpm(8) to embed metadata
-  regarding a package to each executable file.
-
 * `--pie`, `--pic-executable`, `--no-pie`, `--no-pic-executable`:
   Create a position-independent executable.
 
@@ -809,7 +875,7 @@ but as `-o magic`.
 * `-z interpose`:
   Mark object to interpose all DSOs but executable.
 
-* `-(`, `-)`, `-EL`, `-O`_number_, `--allow-shlib-undefined`, `--dc`, `--dp`, `--end-group`, `--no-add-needed`, `--no-allow-shlib-undefined`, `--no-copy-dt-needed-entries`, `--nostdlib`, `--rpath-link=Ar dir`, `--sort-common`, `--sort-section`, `--start-group`, `--warn-constructors`, `--warn-once`, `--fix-cortex-a53-835769`, `--fix-cortex-a53-843419`, `-z combreloc`, `-z common-page-size`, `-z nocombreloc`:
+* `-(`, `-)`, `-EL`, `-O`_number_, `--dc`, `--dp`, `--end-group`, `--no-add-needed`, `--no-copy-dt-needed-entries`, `--nostdlib`, `--rpath-link=Ar dir`, `--sort-common`, `--sort-section`, `--start-group`, `--warn-constructors`, `--warn-once`, `--fix-cortex-a53-835769`, `--fix-cortex-a53-843419`, `-z combreloc`, `-z common-page-size`, `-z nocombreloc`:
   Ignored
 
 ## ENVIRONMENT VARIABLES
@@ -832,6 +898,8 @@ but as `-o magic`.
   consider setting this environment variable to `1` to see if it addresses the
   OOM issue.
 
+  Currently, any value other than `1` is silently ignored.
+
 * `MOLD_DEBUG`:
   If this variable is set to a non-empty string, `mold` embeds its
   command-line options in the output file's `.comment` section.
diff --git a/elf/arch-alpha.cc b/elf/arch-alpha.cc
deleted file mode 100644
index d7189434..00000000
--- a/elf/arch-alpha.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-// Alpha is a 64-bit RISC ISA developed by DEC (Digital Equipment
-// Corporation) in the early '90s. It aimed to be an ISA that would last
-// 25 years. DEC expected Alpha would become 1000x faster during that time
-// span. Since the ISA was developed from scratch for future machines,
-// it's 64-bit from the beginning. There's no 32-bit variant.
-//
-// DEC ported its own Unix (Tru64) to Alpha. Microsoft also ported Windows
-// NT to it. But it wasn't a huge commercial success.
-//
-// DEC was acquired by Compaq in 1997. In the late '90s, Intel and
-// Hewlett-Packard were advertising that their upcoming Itanium processor
-// would achieve significantly better performance than RISC processors, so
-// Compaq decided to discontinue the Alpha processor line to switch to
-// Itanium. Itanium resulted in a miserable failure, but it still suceeded
-// to wipe out several RISC processors just by promising overly optimistic
-// perf numbers. Alpha as an ISA would probably have been fine after 25
-// years since its introduction (which is 1992 + 25 = 2017), but the
-// company and its market didn't last that long.
-//
-// From the linker's point of view, there are a few peculiarities in its
-// psABI as shown below:
-//
-//  - Alpha lacks PC-relative memory load/store instructions, so it uses
-//    register-relative load/store instructions in position-independent
-//    code. Specifically, GP (which is an alias for $r29) is always
-//    maintained to refer to .got+0x8000, and global variables' addresses
-//    are loaded in a GP-relative manner.
-//
-//  - It looks like even function addresses are first loaded to register
-//    in a GP-relative manner before calling it. We can relax it to
-//    convert the instruction sequence with a direct branch instruction,
-//    but by default, object files don't use a direct branch to call a
-//    function. Therefore, by default, we don't need to create a PLT.
-//    Any function call is made by first reading its address from GOT and
-//    jump to the address.
-
-#include "mold.h"
-
-namespace mold::elf {
-
-using E = ALPHA;
-
-// A 32-bit immediate can be materialized in a register with a "load high"
-// and a "load low" instruction sequence. The first instruction sets the
-// upper 16 bits in a register, and the second one set the lower 16
-// bits. When doing so, they sign-extend an immediate.  Therefore, if the
-// 15th bit of an immediate happens to be 1, setting a "low half" value
-// negates the upper 16 bit values that has already been set in a
-// register. To compensate that, we need to add 0x8000 when setting the
-// upper 16 bits.
-static u32 hi(u32 val) {
-  return bits(val + 0x8000, 31, 16);
-}
-
-template <>
-void write_plt_header(Context<E> &ctx, u8 *buf) {}
-
-template <>
-void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
-
-template <>
-void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
-
-template <>
-void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
-                                       u64 offset, u64 val) {
-  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
-
-  switch (rel.r_type) {
-  case R_NONE:
-    break;
-  case R_ALPHA_SREL32:
-    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
-    break;
-  default:
-    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
-  }
-}
-
-template <>
-void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
-  std::span<const ElfRel<E>> rels = get_rels(ctx);
-
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
-  for (i64 i = 0; i < rels.size(); i++) {
-    const ElfRel<E> &rel = rels[i];
-    if (rel.r_type == R_NONE)
-      continue;
-
-    Symbol<E> &sym = *file.symbols[rel.r_sym];
-    u8 *loc = base + rel.r_offset;
-
-    u64 S = sym.get_addr(ctx);
-    u64 A = rel.r_addend;
-    u64 P = get_addr() + rel.r_offset;
-    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
-    u64 GOT = ctx.got->shdr.sh_addr;
-    u64 GP = ctx.got->shdr.sh_addr + 0x8000;
-
-    switch (rel.r_type) {
-    case R_ALPHA_REFQUAD:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      break;
-    case R_ALPHA_GPREL32:
-      *(ul32 *)loc = S + A - GP;
-      break;
-    case R_ALPHA_LITERAL:
-      if (A)
-        *(ul16 *)loc = ctx.extra.got->get_addr(sym, A) - GP;
-      else
-        *(ul16 *)loc = GOT + G - GP;
-      break;
-    case R_ALPHA_BRSGP:
-      *(ul32 *)loc |= bits(S + A - P - 4, 22, 0);
-      break;
-    case R_ALPHA_GPDISP:
-      *(ul16 *)loc = hi(GP - P);
-      *(ul16 *)(loc + A) = GP - P;
-      break;
-    case R_ALPHA_SREL32:
-      *(ul32 *)loc = S + A - P;
-      break;
-    case R_ALPHA_GPRELHIGH:
-      *(ul16 *)loc = hi(S + A - GP);
-      break;
-    case R_ALPHA_GPRELLOW:
-      *(ul16 *)loc = S + A - GP;
-      break;
-    case R_ALPHA_TLSGD:
-      *(ul16 *)loc = sym.get_tlsgd_addr(ctx) - GP;
-      break;
-    case R_ALPHA_TLSLDM:
-      *(ul16 *)loc = ctx.got->get_tlsld_addr(ctx) - GP;
-      break;
-    case R_ALPHA_DTPRELHI:
-      *(ul16 *)loc = hi(S + A - ctx.dtp_addr);
-      break;
-    case R_ALPHA_DTPRELLO:
-      *(ul16 *)loc = S + A - ctx.dtp_addr;
-      break;
-    case R_ALPHA_GOTTPREL:
-      *(ul16 *)loc = sym.get_gottp_addr(ctx) + A - GP;
-      break;
-    case R_ALPHA_TPRELHI:
-      *(ul16 *)loc = hi(S + A - ctx.tp_addr);
-      break;
-    case R_ALPHA_TPRELLO:
-      *(ul16 *)loc = S + A - ctx.tp_addr;
-      break;
-    case R_ALPHA_LITUSE:
-    case R_ALPHA_HINT:
-      break;
-    default:
-      unreachable();
-    }
-  }
-}
-
-template <>
-void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
-  std::span<const ElfRel<E>> rels = get_rels(ctx);
-
-  for (i64 i = 0; i < rels.size(); i++) {
-    const ElfRel<E> &rel = rels[i];
-    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
-      continue;
-
-    Symbol<E> &sym = *file.symbols[rel.r_sym];
-    u8 *loc = base + rel.r_offset;
-
-    SectionFragment<E> *frag;
-    i64 frag_addend;
-    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
-
-    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
-    u64 A = frag ? frag_addend : (i64)rel.r_addend;
-
-    switch (rel.r_type) {
-    case R_ALPHA_REFLONG:
-      if (std::optional<u64> val = get_tombstone(sym, frag))
-        *(ul32 *)loc = *val;
-      else
-        *(ul32 *)loc = S + A;
-      break;
-    case R_ALPHA_REFQUAD:
-      if (std::optional<u64> val = get_tombstone(sym, frag))
-        *(ul64 *)loc = *val;
-      else
-        *(ul64 *)loc = S + A;
-      break;
-    default:
-      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
-                 << rel;
-    }
-  }
-}
-
-template <>
-void InputSection<E>::scan_relocations(Context<E> &ctx) {
-  assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
-  std::span<const ElfRel<E>> rels = get_rels(ctx);
-
-  for (i64 i = 0; i < rels.size(); i++) {
-    const ElfRel<E> &rel = rels[i];
-    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
-      continue;
-
-    Symbol<E> &sym = *file.symbols[rel.r_sym];
-
-    if (sym.is_ifunc())
-      Error(ctx) << sym << ": GNU ifunc symbol is not supported on Alpha";
-
-    switch (rel.r_type) {
-    case R_ALPHA_REFQUAD:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
-    case R_ALPHA_LITERAL:
-      if (rel.r_addend)
-        ctx.extra.got->add_symbol(sym, rel.r_addend);
-      else
-        sym.flags |= NEEDS_GOT;
-      break;
-    case R_ALPHA_SREL32:
-      scan_pcrel(ctx, sym, rel);
-      break;
-    case R_ALPHA_BRSGP:
-      if (sym.is_imported)
-        sym.flags |= NEEDS_PLT;
-      break;
-    case R_ALPHA_TLSGD:
-      sym.flags |= NEEDS_TLSGD;
-      break;
-    case R_ALPHA_TLSLDM:
-      ctx.needs_tlsld = true;
-      break;
-    case R_ALPHA_GOTTPREL:
-      sym.flags |= NEEDS_GOTTP;
-      break;
-    case R_ALPHA_TPRELHI:
-    case R_ALPHA_TPRELLO:
-      check_tlsle(ctx, sym, rel);
-      break;
-    case R_ALPHA_GPREL32:
-    case R_ALPHA_LITUSE:
-    case R_ALPHA_GPDISP:
-    case R_ALPHA_HINT:
-    case R_ALPHA_GPRELHIGH:
-    case R_ALPHA_GPRELLOW:
-    case R_ALPHA_DTPRELHI:
-    case R_ALPHA_DTPRELLO:
-      break;
-    default:
-      Fatal(ctx) << *this << ": unknown relocation: " << rel;
-    }
-  }
-}
-
-// An R_ALPHA_LITERAL relocation may request the linker to create a GOT
-// entry for an external symbol with a non-zero addend. This is an unusual
-// request which is not found in any other targets.
-//
-// Referring an external symbol with a non-zero addend is a bad practice
-// because we need to create as many dynamic relocations as the number of
-// distinctive addends for the same symbol.
-//
-// We don't want to mess up the implementation of the common GOT section
-// for Alpha. So we create another GOT-like section, .alpha_got. Any GOT
-// entry for an R_ALPHA_LITERAL reloc with a non-zero addend is created
-// not in .got but in .alpha_got.
-//
-// Since .alpha_got entries are accessed relative to GP, .alpha_got
-// needs to be close enough to .got. It's actually placed next to .got.
-void AlphaGotSection::add_symbol(Symbol<E> &sym, i64 addend) {
-  assert(addend);
-  std::scoped_lock lock(mu);
-  entries.push_back({&sym, addend});
-}
-
-bool operator<(const AlphaGotSection::Entry &a, const AlphaGotSection::Entry &b) {
-  return std::tuple(a.sym->file->priority, a.sym->sym_idx, a.addend) <
-         std::tuple(b.sym->file->priority, b.sym->sym_idx, b.addend);
-};
-
-u64 AlphaGotSection::get_addr(Symbol<E> &sym, i64 addend) {
-  auto it = std::lower_bound(entries.begin(), entries.end(), Entry{&sym, addend});
-  assert(it != entries.end());
-  return this->shdr.sh_addr + (it - entries.begin()) * sizeof(Word<E>);
-}
-
-i64 AlphaGotSection::get_reldyn_size(Context<E> &ctx) const {
-  i64 n = 0;
-  for (const Entry &e : entries)
-    if (e.sym->is_imported || (ctx.arg.pic && !e.sym->is_absolute()))
-      n++;
-  return n;
-}
-
-void AlphaGotSection::finalize() {
-  sort(entries);
-  remove_duplicates(entries);
-  shdr.sh_size = entries.size() * sizeof(Word<E>);
-}
-
-void AlphaGotSection::copy_buf(Context<E> &ctx) {
-  ElfRel<E> *dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                                    reldyn_offset);
-
-  for (i64 i = 0; i < entries.size(); i++) {
-    Entry &e = entries[i];
-    u64 P = this->shdr.sh_addr + sizeof(Word<E>) * i;
-    ul64 *buf = (ul64 *)(ctx.buf + this->shdr.sh_offset + sizeof(Word<E>) * i);
-
-    if (e.sym->is_imported) {
-      *buf = ctx.arg.apply_dynamic_relocs ? e.addend : 0;
-      *dynrel++ = ElfRel<E>(P, E::R_ABS, e.sym->get_dynsym_idx(ctx), e.addend);
-    } else {
-      *buf = e.sym->get_addr(ctx) + e.addend;
-      if (ctx.arg.pic && !e.sym->is_absolute())
-        *dynrel++ = ElfRel<E>(P, E::R_RELATIVE, 0, *buf);
-    }
-  }
-}
-
-} // namespace mold::elf
diff --git a/install-build-deps.sh b/install-build-deps.sh
index 8e5568fa..4ef3bac1 100755
--- a/install-build-deps.sh
+++ b/install-build-deps.sh
@@ -19,15 +19,12 @@ ubuntu-* | pop-* | linuxmint-* | debian-* | raspbian-*)
 fedora-* | amzn-* | rhel-*)
   dnf install -y gcc-g++ cmake glibc-static libstdc++-static diffutils util-linux
   ;;
-opensuse-leap-*)
-  zypper install -y make cmake gcc-c++ gcc11-c++ glibc-devel-static tar diffutils util-linux
-  ;;
-opensuse-tumbleweed-*)
+opensuse-*)
   zypper install -y make cmake gcc-c++ glibc-devel-static tar diffutils util-linux
   ;;
 gentoo-*)
   emerge-webrsync
-  emerge dev-build/cmake
+  FEATURES='getbinpkg binpkg-request-signature' emerge dev-build/cmake
   ;;
 arch-* | archarm-* | artix-* | endeavouros-*)
   pacman -Sy --needed --noconfirm base-devel cmake util-linux
@@ -43,6 +40,13 @@ clear-linux-*)
   swupd update
   swupd bundle-add c-basic diffutils
   ;;
+almalinux-*)
+  dnf install -y gcc-toolset-13-gcc-c++ gcc-toolset-13-libstdc++-devel cmake diffutils
+  ;;
+freebsd-*)
+  pkg update
+  pkg install -y cmake bash binutils gcc
+  ;;
 *)
   echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID"
   exit 1
diff --git a/install-cross-tools.sh b/install-cross-tools.sh
index 86dc10dd..fcac8ef8 100755
--- a/install-cross-tools.sh
+++ b/install-cross-tools.sh
@@ -11,7 +11,7 @@ set -x
 
 case "$ID-$VERSION_ID" in
 ubuntu-* | pop-* | linuxmint-* | debian-* | raspbian-*)
-  apt-get install -y qemu-user {gcc,g++}-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu {gcc,g++}-arm-linux-gnueabihf
+  apt-get install -y qemu-user {gcc,g++}-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4}-linux-gnu {gcc,g++}-arm-linux-gnueabihf
   ;;
 *)
   echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID"
diff --git a/common/archive-file.h b/lib/archive-file.h
similarity index 93%
rename from common/archive-file.h
rename to lib/archive-file.h
index 11158f49..9ce4a030 100644
--- a/common/archive-file.h
+++ b/lib/archive-file.h
@@ -26,7 +26,6 @@
 #pragma once
 
 #include "common.h"
-#include "filetype.h"
 
 namespace mold {
 
@@ -76,7 +75,7 @@ struct ArHdr {
   }
 };
 
-template <typename Context, typename MappedFile>
+template <typename Context>
 std::vector<MappedFile *>
 read_thin_archive_members(Context &ctx, MappedFile *mf) {
   u8 *begin = mf->data;
@@ -124,7 +123,7 @@ read_thin_archive_members(Context &ctx, MappedFile *mf) {
   return vec;
 }
 
-template <typename Context, typename MappedFile>
+template <typename Context>
 std::vector<MappedFile *> read_fat_archive_members(Context &ctx, MappedFile *mf) {
   u8 *begin = mf->data;
   u8 *data = begin + 8;
@@ -162,16 +161,13 @@ std::vector<MappedFile *> read_fat_archive_members(Context &ctx, MappedFile *mf)
   return vec;
 }
 
-template <typename Context, typename MappedFile>
+template <typename Context>
 std::vector<MappedFile *> read_archive_members(Context &ctx, MappedFile *mf) {
-  switch (get_file_type(ctx, mf)) {
-  case FileType::AR:
+  std::string_view str = mf->get_contents();
+  if (str.starts_with("!<arch>\n"))
     return read_fat_archive_members(ctx, mf);
-  case FileType::THIN_AR:
-    return read_thin_archive_members(ctx, mf);
-  default:
-    unreachable();
-  }
+  assert(str.starts_with("!<thin>\n"));
+  return read_thin_archive_members(ctx, mf);
 }
 
 } // namespace mold
diff --git a/common/common.h b/lib/common.h
similarity index 90%
rename from common/common.h
rename to lib/common.h
index 986448e1..d915c97e 100644
--- a/common/common.h
+++ b/lib/common.h
@@ -2,6 +2,7 @@
 
 #include "integers.h"
 
+#include <array>
 #include <atomic>
 #include <bit>
 #include <bitset>
@@ -61,8 +62,6 @@ namespace mold {
 using namespace std::literals::string_literals;
 using namespace std::literals::string_view_literals;
 
-template <typename Context> class OutputFile;
-
 inline char *output_tmpfile;
 
 inline u8 *output_buffer_start = nullptr;
@@ -75,7 +74,7 @@ std::string get_self_path();
 void cleanup();
 void install_signal_handler();
 
-static u64 combine_hash(u64 a, u64 b) {
+inline u64 combine_hash(u64 a, u64 b) {
   return a ^ (b + 0x9e3779b9 + (a << 6) + (a >> 2));
 }
 
@@ -442,10 +441,9 @@ inline i64 write_string(void *buf, std::string_view str) {
 }
 
 template <typename T>
-inline i64 write_vector(void *buf, const std::vector<T> &vec) {
-  i64 sz = vec.size() * sizeof(T);
-  memcpy(buf, vec.data(), sz);
-  return sz;
+inline void write_vector(void *buf, const std::vector<T> &vec) {
+  if (!vec.empty())
+    memcpy(buf, vec.data(), vec.size() * sizeof(T));
 }
 
 inline void encode_uleb(std::vector<u8> &vec, u64 val) {
@@ -525,23 +523,6 @@ inline void overwrite_uleb(u8 *loc, u64 val) {
   *loc = val & 0b0111'1111;
 }
 
-template <typename Context>
-std::string_view save_string(Context &ctx, const std::string &str) {
-  u8 *buf = new u8[str.size() + 1];
-  memcpy(buf, str.data(), str.size());
-  buf[str.size()] = '\0';
-  ctx.string_pool.push_back(std::unique_ptr<u8[]>(buf));
-  return {(char *)buf, str.size()};
-}
-
-inline bool remove_prefix(std::string_view &s, std::string_view prefix) {
-  if (s.starts_with(prefix)) {
-    s = s.substr(prefix.size());
-    return true;
-  }
-  return false;
-}
-
 static inline void pause() {
 #if defined(__x86_64__)
   asm volatile("pause");
@@ -726,74 +707,6 @@ class ConcurrentMap {
 
 void get_random_bytes(u8 *buf, i64 size);
 
-//
-// output-file.h
-//
-
-template <typename Context>
-class OutputFile {
-public:
-  static std::unique_ptr<OutputFile<Context>>
-  open(Context &ctx, std::string path, i64 filesize, i64 perm);
-
-  virtual void close(Context &ctx) = 0;
-  virtual ~OutputFile() = default;
-
-  u8 *buf = nullptr;
-  std::vector<u8> buf2;
-  std::string path;
-  i64 fd = -1;
-  i64 filesize = 0;
-  bool is_mmapped = false;
-  bool is_unmapped = false;
-
-protected:
-  OutputFile(std::string path, i64 filesize, bool is_mmapped)
-    : path(path), filesize(filesize), is_mmapped(is_mmapped) {}
-};
-
-template <typename Context>
-class MallocOutputFile : public OutputFile<Context> {
-public:
-  MallocOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm)
-    : OutputFile<Context>(path, filesize, false), ptr(new u8[filesize]),
-      perm(perm) {
-    this->buf = ptr.get();
-  }
-
-  void close(Context &ctx) override {
-    Timer t(ctx, "close_file");
-    FILE *fp;
-
-    if (this->path == "-") {
-      fp = stdout;
-    } else {
-#ifdef _WIN32
-      int pmode = (perm & 0200) ? (_S_IREAD | _S_IWRITE) : _S_IREAD;
-      i64 fd = _open(this->path.c_str(), _O_RDWR | _O_CREAT | _O_BINARY, pmode);
-#else
-      i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm);
-#endif
-      if (fd == -1)
-        Fatal(ctx) << "cannot open " << this->path << ": " << errno_string();
-#ifdef _WIN32
-      fp = _fdopen(fd, "wb");
-#else
-      fp = fdopen(fd, "w");
-#endif
-    }
-
-    fwrite(this->buf, this->filesize, 1, fp);
-    if (!this->buf2.empty())
-      fwrite(this->buf2.data(), this->buf2.size(), 1, fp);
-    fclose(fp);
-  }
-
-private:
-  std::unique_ptr<u8[]> ptr;
-  i64 perm;
-};
-
 //
 // hyperloglog.cc
 //
@@ -899,6 +812,13 @@ std::optional<std::string_view> demangle_rust(std::string_view name);
 void acquire_global_lock();
 void release_global_lock();
 
+//
+// crc32.cc
+//
+
+u32 compute_crc32(u32 crc, u8 *buf, i64 len);
+std::vector<u8> crc32_solve(u32 current, u32 desired);
+
 //
 // compress.cc
 //
diff --git a/common/compress.cc b/lib/compress.cc
similarity index 100%
rename from common/compress.cc
rename to lib/compress.cc
diff --git a/common/config.h.in b/lib/config.h.in
similarity index 100%
rename from common/config.h.in
rename to lib/config.h.in
diff --git a/lib/crc32.cc b/lib/crc32.cc
new file mode 100644
index 00000000..d3f71783
--- /dev/null
+++ b/lib/crc32.cc
@@ -0,0 +1,60 @@
+#include "common.h"
+
+#include <tbb/parallel_for_each.h>
+#include <zlib.h>
+
+namespace mold {
+
+// This function "forges" a CRC. That is, given the current and a desired
+// CRC32 value, crc32_solve() returns a binary blob to add to the end of
+// the original data to yield the desired CRC. Trailing garbage is ignored
+// by many binary file formats, so you can create a file with a desired
+// CRC using crc32_solve(). We need it for --separate-debug-file.
+std::vector<u8> crc32_solve(u32 current, u32 desired) {
+  constexpr u32 poly = 0xedb88320;
+  u32 x = ~desired;
+
+  // Each iteration computes x = (x * x^-1) mod poly.
+  for (i64 i = 0; i < 32; i++) {
+    x = std::rotl(x, 1);
+    x ^= (x & 1) * (poly << 1);
+  }
+
+  x ^= ~current;
+
+  std::vector<u8> out(4);
+  out[0] = x;
+  out[1] = x >> 8;
+  out[2] = x >> 16;
+  out[3] = x >> 24;
+  return out;
+}
+
+// Compute a CRC for given data in parallel
+u32 compute_crc32(u32 crc, u8 *buf, i64 len) {
+  struct Shard {
+    u8 *buf;
+    i64 len;
+    u32 crc;
+  };
+
+  constexpr i64 shard_size = 1024 * 1024; // 1 MiB
+  std::vector<Shard> shards;
+
+  while (len > 0) {
+    i64 sz = std::min(len, shard_size);
+    shards.push_back({buf, sz, 0});
+    buf += sz;
+    len -= sz;
+  }
+
+  tbb::parallel_for_each(shards.begin(), shards.end(), [](Shard &shard) {
+    shard.crc = crc32(0, shard.buf, shard.len);
+  });
+
+  for (Shard &shard : shards)
+    crc = crc32_combine(crc, shard.crc, shard.len);
+  return crc;
+}
+
+} // namespace mold
diff --git a/common/demangle.cc b/lib/demangle.cc
similarity index 100%
rename from common/demangle.cc
rename to lib/demangle.cc
diff --git a/common/filepath.cc b/lib/filepath.cc
similarity index 100%
rename from common/filepath.cc
rename to lib/filepath.cc
diff --git a/test/gentoo-test.sh b/lib/gentoo-test.sh
similarity index 89%
rename from test/gentoo-test.sh
rename to lib/gentoo-test.sh
index fd5c4ca8..dbdae006 100755
--- a/test/gentoo-test.sh
+++ b/lib/gentoo-test.sh
@@ -26,12 +26,13 @@ if ! docker image ls mold-gentoo | grep -q mold-gentoo; then
   cat <<EOF | docker build -t mold-gentoo -
 FROM gentoo/stage3
 RUN emerge-webrsync
-RUN echo 'USE="X ssl elogind -systemd corefonts truetype jpeg jpeg2k tiff zstd static-libs binary"' >> /etc/portage/make.conf && \
+RUN echo 'USE="X ssl elogind -systemd corefonts truetype jpeg jpeg2k tiff zstd static-libs binary -perl"' >> /etc/portage/make.conf && \
     echo 'ACCEPT_KEYWORDS="~amd64"' >> /etc/portage/make.conf && \
     echo 'ACCEPT_LICENSE="* -@EULA"' >> /etc/portage/make.conf && \
     echo 'FEATURES="\${FEATURE} noclean nostrip ccache -ipc-sandbox -network-sandbox -pid-sandbox -sandbox"' >> /etc/portage/make.conf && \
-    echo 'CCACHE_DIR="/ccache"' >> /etc/portage/make.conf
-RUN emerge gdb lld clang vim emacs strace ccache xeyes dev-build/cmake dev-vcs/git && rm -rf /var/tmp/portage
+    echo 'CCACHE_DIR="/ccache"' >> /etc/portage/make.conf && \
+    emerge gdb lld clang vim emacs strace ccache xeyes dev-build/cmake dev-vcs/git && \
+    rm -rf /var/tmp/portage
 EOF
   set +e
 fi
diff --git a/common/glob.cc b/lib/glob.cc
similarity index 100%
rename from common/glob.cc
rename to lib/glob.cc
diff --git a/common/hyperloglog.cc b/lib/hyperloglog.cc
similarity index 100%
rename from common/hyperloglog.cc
rename to lib/hyperloglog.cc
diff --git a/lib/integers.h b/lib/integers.h
new file mode 100644
index 00000000..11582f70
--- /dev/null
+++ b/lib/integers.h
@@ -0,0 +1,153 @@
+// This file defines integral types for file input/output. We need to use
+// these types instead of the plain integers (such as uint32_t or int32_t)
+// when reading from/writing to an mmap'ed file area for the following
+// reasons:
+//
+// 1. mold is always a cross linker and should not depend on what host it
+//    is running on. For example, users should be able to run mold on a
+//    big-endian SPARC machine to create a little-endian RV64 binary.
+//
+// 2. Even though data members in all ELF data structures are naturally
+//    aligned, they are not guaranteed to be aligned on memory because of
+//    archive files. Archive files (.a files) align each member only to a
+//    2 byte boundary, so anything larger than 2 bytes may be misaligned
+//    in an mmap'ed memory. Misaligned access is an undefined behavior in
+//    C/C++, so we shouldn't cast an arbitrary pointer to a uint32_t, for
+//    example, to read a 32 bit value.
+//
+// The data types defined in this file don't depend on host byte order and
+// don't do unaligned access.
+
+#pragma once
+
+#include <bit>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#  define __LITTLE_ENDIAN__ 1
+# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#  define __BIG_ENDIAN__ 1
+# else
+#  error "unknown host byte order"
+# endif
+#endif
+
+namespace mold {
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t i8;
+typedef int16_t i16;
+typedef int32_t i32;
+typedef int64_t i64;
+
+// Integer<T, endian, size> stores a value of type T as `size` raw bytes
+// in the given byte order. Because the only data member is a u8 array,
+// the class has an alignment of 1 and is read and written one byte at a
+// time. `size` defaults to sizeof(T); the 3-byte variants (ul24 and
+// ub24) store the low 24 bits of a u32.
+template <typename T, std::endian endian, int size = sizeof(T)>
+class Integer {
+public:
+  constexpr Integer() = default;
+
+  constexpr Integer(T x) requires (endian == std::endian::little && size == 2)
+    : buf{(u8)x, (u8)(x >> 8)} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::little && size == 3)
+    : buf{(u8)x, (u8)(x >> 8), (u8)(x >> 16)} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::little && size == 4)
+    : buf{(u8)x, (u8)(x >> 8), (u8)(x >> 16), (u8)(x >> 24)} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::little && size == 8)
+    : buf{(u8)x,         (u8)(x >> 8),  (u8)(x >> 16), (u8)(x >> 24),
+          (u8)(x >> 32), (u8)(x >> 40), (u8)(x >> 48), (u8)(x >> 56)} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::big && size == 2)
+    : buf{(u8)(x >> 8), (u8)x} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::big && size == 3)
+    : buf{(u8)(x >> 16), (u8)(x >> 8), (u8)x} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::big && size == 4)
+    : buf{(u8)(x >> 24), (u8)(x >> 16), (u8)(x >> 8), (u8)x} {}
+
+  constexpr Integer(T x) requires (endian == std::endian::big && size == 8)
+    : buf{(u8)(x >> 56), (u8)(x >> 48), (u8)(x >> 40), (u8)(x >> 32),
+          (u8)(x >> 24), (u8)(x >> 16), (u8)(x >> 8),  (u8)x} {}
+
+  // Re-runs the matching constructor in place to re-encode x into buf.
+  Integer &operator=(T x) {
+    new (this) Integer(x);
+    return *this;
+  }
+
+  // Decodes buf back into a host-order T.
+  // Only sizes 2, 3, 4 and 8 are supported; the final `else` assumes 8.
+  operator T() const {
+    if constexpr (endian == std::endian::little) {
+      if constexpr (size == 2)
+        return buf[1] << 8 | buf[0];
+      else if constexpr (size == 3)
+        return buf[2] << 16 | buf[1] << 8 | buf[0];
+      else if constexpr (size == 4)
+        return buf[3] << 24 | buf[2] << 16 | buf[1] << 8 | buf[0];
+      else
+        return (u64)buf[7] << 56 | (u64)buf[6] << 48 |
+               (u64)buf[5] << 40 | (u64)buf[4] << 32 |
+               (u64)buf[3] << 24 | (u64)buf[2] << 16 |
+               (u64)buf[1] << 8  | (u64)buf[0];
+    } else {
+      if constexpr (size == 2)
+        return buf[0] << 8 | buf[1];
+      else if constexpr (size == 3)
+        return buf[0] << 16 | buf[1] << 8 | buf[2];
+      else if constexpr (size == 4)
+        return buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3];
+      else
+        return (u64)buf[0] << 56 | (u64)buf[1] << 48 |
+               (u64)buf[2] << 40 | (u64)buf[3] << 32 |
+               (u64)buf[4] << 24 | (u64)buf[5] << 16 |
+               (u64)buf[6] << 8  | (u64)buf[7];
+    }
+  }
+
+  Integer &operator++()    { return *this = *this + 1; }
+  Integer operator++(int)  { return ++*this - 1; }
+  Integer &operator--()    { return *this = *this - 1; }
+  Integer operator--(int)  { return --*this + 1; }
+  Integer &operator+=(T x) { return *this = *this + x; }
+  Integer &operator-=(T x) { return *this = *this - x; }
+  Integer &operator&=(T x) { return *this = *this & x; }
+  Integer &operator|=(T x) { return *this = *this | x; }
+
+private:
+  u8 buf[size];
+};
+
+using il16 = Integer<i16, std::endian::little>;
+using il32 = Integer<i32, std::endian::little>;
+using il64 = Integer<i64, std::endian::little>;
+
+using ul16 = Integer<u16, std::endian::little>;
+using ul24 = Integer<u32, std::endian::little, 3>;
+using ul32 = Integer<u32, std::endian::little>;
+using ul64 = Integer<u64, std::endian::little>;
+
+using ib16 = Integer<i16, std::endian::big>;
+using ib32 = Integer<i32, std::endian::big>;
+using ib64 = Integer<i64, std::endian::big>;
+
+using ub16 = Integer<u16, std::endian::big>;
+using ub24 = Integer<u32, std::endian::big, 3>;
+using ub32 = Integer<u32, std::endian::big>;
+using ub64 = Integer<u64, std::endian::big>;
+
+} // namespace mold
diff --git a/lib/jobs-unix.cc b/lib/jobs-unix.cc
new file mode 100644
index 00000000..9912ab52
--- /dev/null
+++ b/lib/jobs-unix.cc
@@ -0,0 +1,66 @@
+// Many build systems attempt to invoke as many linker processes as there
+// are cores, based on the assumption that the linker is single-threaded.
+// However, since mold is multi-threaded, such build systems' behavior is
+// not beneficial and just increases the overall peak memory usage.
+// On machines with limited memory, this could lead to an out-of-memory
+// error.
+//
+// This file implements a feature that limits the number of concurrent
+// mold processes to just 1 for each user. It is intended to be used as
+// `MOLD_JOBS=1 ninja` or `MOLD_JOBS=1 make -j$(nproc)`.
+
+#include "common.h"
+
+#include <fcntl.h>
+#include <pwd.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace mold {
+
+// File descriptor holding the inter-process lock, or -1 if none is held.
+static int lock_fd = -1;
+
+// Blocks until this process holds the per-user mold lock. Does nothing
+// unless MOLD_JOBS is set to exactly "1". Locking is best-effort: if the
+// lock file cannot be opened or locked, we simply proceed without it.
+void acquire_global_lock() {
+  char *jobs = getenv("MOLD_JOBS");
+  if (!jobs || jobs != "1"s)
+    return;
+
+  std::string path;
+  if (char *dir = getenv("XDG_RUNTIME_DIR")) {
+    path = dir + "/mold-lock"s;
+  } else if (struct passwd *pw = getpwuid(getuid())) {
+    path = "/tmp/mold-lock-"s + pw->pw_name;
+  } else {
+    // getpwuid() returns a null pointer when the UID has no passwd entry
+    // (e.g. an arbitrary UID in a container), so use the numeric UID.
+    path = "/tmp/mold-lock-"s + std::to_string(getuid());
+  }
+
+  int fd = open(path.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC, 0600);
+  if (fd == -1)
+    return;
+
+  if (lockf(fd, F_LOCK, 0) == -1) {
+    // Don't leak the descriptor if we failed to take the lock.
+    close(fd);
+    return;
+  }
+  lock_fd = fd;
+}
+
+// Releases the lock taken by acquire_global_lock(), if any, by closing
+// its descriptor. Resetting lock_fd makes repeated calls harmless.
+void release_global_lock() {
+  if (lock_fd != -1) {
+    close(lock_fd);
+    lock_fd = -1;
+  }
+}
+
+} // namespace mold
diff --git a/common/jobs-win32.cc b/lib/jobs-win32.cc
similarity index 100%
rename from common/jobs-win32.cc
rename to lib/jobs-win32.cc
diff --git a/common/malloc.cc b/lib/malloc.cc
similarity index 100%
rename from common/malloc.cc
rename to lib/malloc.cc
diff --git a/common/mapped-file-unix.cc b/lib/mapped-file-unix.cc
similarity index 100%
rename from common/mapped-file-unix.cc
rename to lib/mapped-file-unix.cc
diff --git a/common/mapped-file-win32.cc b/lib/mapped-file-win32.cc
similarity index 100%
rename from common/mapped-file-win32.cc
rename to lib/mapped-file-win32.cc
diff --git a/common/multi-glob.cc b/lib/multi-glob.cc
similarity index 100%
rename from common/multi-glob.cc
rename to lib/multi-glob.cc
diff --git a/common/perf.cc b/lib/perf.cc
similarity index 100%
rename from common/perf.cc
rename to lib/perf.cc
diff --git a/common/random.cc b/lib/random.cc
similarity index 100%
rename from common/random.cc
rename to lib/random.cc
diff --git a/common/signal-unix.cc b/lib/signal-unix.cc
similarity index 100%
rename from common/signal-unix.cc
rename to lib/signal-unix.cc
diff --git a/common/signal-win32.cc b/lib/signal-win32.cc
similarity index 100%
rename from common/signal-win32.cc
rename to lib/signal-win32.cc
diff --git a/common/siphash.h b/lib/siphash.h
similarity index 100%
rename from common/siphash.h
rename to lib/siphash.h
diff --git a/common/tar.cc b/lib/tar.cc
similarity index 98%
rename from common/tar.cc
rename to lib/tar.cc
index 5c0692d4..30f464bc 100644
--- a/common/tar.cc
+++ b/lib/tar.cc
@@ -1,3 +1,5 @@
+// This file contains functions to create a tar file.
+
 #include "common.h"
 
 #ifdef _WIN32
diff --git a/common/update-git-hash.cmake b/lib/update-git-hash.cmake
similarity index 100%
rename from common/update-git-hash.cmake
rename to lib/update-git-hash.cmake
diff --git a/elf/arch-arm32.cc b/src/arch-arm32.cc
similarity index 90%
rename from elf/arch-arm32.cc
rename to src/arch-arm32.cc
index 291824b5..7ef37392 100644
--- a/elf/arch-arm32.cc
+++ b/src/arch-arm32.cc
@@ -37,7 +37,7 @@
 #include <tbb/parallel_for_each.h>
 #include <tbb/parallel_sort.h>
 
-namespace mold::elf {
+namespace mold {
 
 using E = ARM32;
 
@@ -194,7 +194,7 @@ void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
 
 template <>
 void write_plt_header(Context<E> &ctx, u8 *buf) {
-  static const ul32 insn[] = {
+  constexpr ul32 insn[] = {
     0xe52d'e004, //    push {lr}
     0xe59f'e004, //    ldr lr, 2f
     0xe08f'e00e, // 1: add lr, pc, lr
@@ -209,7 +209,7 @@ void write_plt_header(Context<E> &ctx, u8 *buf) {
   *(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16;
 }
 
-static const ul32 plt_entry[] = {
+constexpr ul32 plt_entry[] = {
   0xe59f'c004, // 1: ldr ip, 2f
   0xe08c'c00f, //    add ip, ip, pc
   0xe59c'f000, //    ldr pc, [ip]
@@ -256,11 +256,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable {
     for (; i < output_section->thunks.size(); i++) {
       i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
@@ -299,7 +294,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     switch (rel.r_type) {
     case R_ARM_ABS32:
     case R_ARM_TARGET1:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_ARM_REL32:
       *(ul32 *)loc = S + A - P;
@@ -489,19 +483,21 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       //  .L2: .word   foo + . - .L1
       //           R_ARM_TLS_GOTDESC
       //
-      // We may relax the instructions to the following for non-dlopen'd DSO
+      // We may relax the instructions to the following if its TP-relative
+      // address is known at link-time
       //
       //       ldr     r0, .L2
-      //  .L1: ldr r0, [pc, r0]
+      //  .L1: nop
       //       ...
-      //  .L2: .word   foo(gottpoff) + . - .L1
+      //  .L2: .word   foo(tpoff)
       //
-      // or to the following for executable.
+      // or to the following if the TP-relative address is known at
+      // process startup time.
       //
       //       ldr     r0, .L2
-      //  .L1: nop
+      //  .L1: ldr r0, [pc, r0]
       //       ...
-      //  .L2: .word   foo(tpoff)
+      //  .L2: .word   foo(gottpoff) + . - .L1
       if (sym.has_tlsdesc(ctx)) {
         // A is odd if the corresponding TLS_CALL is Thumb.
         *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - ((A & 1) ? 6 : 4);
@@ -584,8 +580,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -600,12 +594,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_GOT | NEEDS_PLT;
 
     switch (rel.r_type) {
-    case R_ARM_ABS32:
-    case R_ARM_MOVT_ABS:
-    case R_ARM_THM_MOVT_ABS:
-    case R_ARM_TARGET1:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_ARM_MOVW_ABS_NC:
     case R_ARM_THM_MOVW_ABS_NC:
       scan_absrel(ctx, sym, rel);
@@ -644,6 +632,10 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_ARM_TLS_LE32:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_ARM_ABS32:
+    case R_ARM_MOVT_ABS:
+    case R_ARM_THM_MOVT_ABS:
+    case R_ARM_TARGET1:
     case R_ARM_REL32:
     case R_ARM_BASE_PREL:
     case R_ARM_GOTOFF32:
@@ -666,7 +658,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
   // TLS trampoline code. ARM32's TLSDESC is designed so that this
   // common piece of code is factored out from object files to reduce
   // output size. Since no one provide, the linker has to synthesize it.
-  static ul32 hdr[] = {
+  constexpr ul32 hdr[] = {
     0xe08e'0000, // add r0, lr, r0
     0xe590'1004, // ldr r1, [r0, #4]
     0xe12f'ff11, // bx  r1
@@ -675,7 +667,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
 
   // This is a range extension and mode switch thunk.
   // It has two entry points: +0 for Thumb and +4 for ARM.
-  const u8 entry[] = {
+  static const u8 entry[] = {
     // .thumb
     0x78, 0x47,             //    bx   pc  # jumps to 1f
     0xc0, 0x46,             //    nop
@@ -708,6 +700,45 @@ u64 get_eflags(Context<E> &ctx) {
   return EF_ARM_EABI_VER5;
 }
 
+void create_arm_exidx_section(Context<E> &ctx) {
+  for (i64 i = 0; i < ctx.chunks.size(); i++) {
+    OutputSection<E> *osec = ctx.chunks[i]->to_osec();
+    // Swap the SHT_ARM_EXIDX output section for an Arm32ExidxSection.
+    if (osec && osec->shdr.sh_type == SHT_ARM_EXIDX) {
+      auto *sec = new Arm32ExidxSection(*osec);
+      ctx.extra.exidx = sec;
+      ctx.chunks[i] = sec; // the wrapper takes the original chunk's slot
+      ctx.chunk_pool.emplace_back(sec);
+      // Mark every member dead (presumably the wrapper emits them now).
+      for (InputSection<E> *isec : osec->members)
+        isec->is_alive = false;
+      break; // only the first SHT_ARM_EXIDX output section is replaced
+    }
+  }
+}
+
+void Arm32ExidxSection::compute_section_size(Context<E> &ctx) {
+  output_section.compute_section_size(ctx); // size the wrapped section first
+  this->shdr.sh_size = output_section.shdr.sh_size; // then mirror its size
+}
+
+void Arm32ExidxSection::update_shdr(Context<E> &ctx) {
+  // .ARM.exidx's sh_link should be set to the .text section index.
+  // Runtime doesn't care about it, but the binutils's strip command does.
+  if (Chunk<E> *chunk = find_chunk(ctx, ".text")) // skipped if lookup is null
+    this->shdr.sh_link = chunk->shndx;
+}
+
+void Arm32ExidxSection::remove_duplicate_entries(Context<E> &ctx) {
+  this->shdr.sh_size = get_contents(ctx).size(); // get_contents() dedups
+}
+
+void Arm32ExidxSection::copy_buf(Context<E> &ctx) {
+  std::vector<u8> contents = get_contents(ctx); // sorted, deduplicated table
+  assert(this->shdr.sh_size == contents.size()); // verify; must not assign
+  write_vector(ctx.buf + this->shdr.sh_offset, contents);
+}
+
 // ARM executables use an .ARM.exidx section to look up an exception
 // handling record for the current instruction pointer. The table needs
 // to be sorted by their addresses.
@@ -716,13 +747,12 @@ u64 get_eflags(Context<E> &ctx) {
 // I don't know why only ARM uses the different mechanism, but it's
 // likely that it's due to some historical reason.
 //
-// This function sorts .ARM.exidx records.
-void fixup_arm_exidx_section(Context<E> &ctx) {
-  Timer t(ctx, "fixup_arm_exidx_section");
+// This function returns contents of .ARM.exidx.
+std::vector<u8> Arm32ExidxSection::get_contents(Context<E> &ctx) {
+  std::vector<u8> buf(output_section.shdr.sh_size);
 
-  OutputSection<E> *osec = find_section(ctx, SHT_ARM_EXIDX);
-  if (!osec)
-    return;
+  output_section.shdr.sh_addr = this->shdr.sh_addr;
+  output_section.write_to(ctx, buf.data(), nullptr);
 
   // .ARM.exidx records consists of a signed 31-bit relative address
   // and a 32-bit value. The relative address indicates the start
@@ -736,24 +766,24 @@ void fixup_arm_exidx_section(Context<E> &ctx) {
   //
   // CANTUNWIND is value 1. The most significant bit is set in (2) but
   // not in (3). So we can distinguished them just by looking at a value.
-  const u32 EXIDX_CANTUNWIND = 1;
+  const u32 CANTUNWIND = 1;
 
   struct Entry {
     ul32 addr;
     ul32 val;
   };
 
-  if (osec->shdr.sh_size % sizeof(Entry))
+  if (buf.size() % sizeof(Entry))
     Fatal(ctx) << "invalid .ARM.exidx section size";
 
-  Entry *ent = (Entry *)(ctx.buf + osec->shdr.sh_offset);
-  i64 num_entries = osec->shdr.sh_size / sizeof(Entry);
+  Entry *ent = (Entry *)buf.data();
+  i64 num_entries = buf.size() / sizeof(Entry);
 
   // Entry's addresses are relative to themselves. In order to sort
-  // records by addresses, we first translate them so that the addresses
+  // records by address, we first translate them so that the addresses
   // are relative to the beginning of the section.
   auto is_relative = [](u32 val) {
-    return val != EXIDX_CANTUNWIND && !(val & 0x8000'0000);
+    return val != CANTUNWIND && !(val & 0x8000'0000);
   };
 
   tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
@@ -763,10 +793,21 @@ void fixup_arm_exidx_section(Context<E> &ctx) {
       ent[i].val = 0x7fff'ffff & (ent[i].val + offset);
   });
 
-  tbb::parallel_sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) {
+  std::sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) {
     return a.addr < b.addr;
   });
 
+  // Remove duplicate adjacent entries. That is, if two adjacent functions
+  // have the same compact unwind info or are both CANTUNWIND, we can
+  // merge them into a single address range.
+  auto it = std::unique(ent, ent + num_entries,
+                        [](const Entry &a, const Entry &b) {
+    return a.val == b.val;
+  });
+
+  num_entries = it - ent;
+  buf.resize(num_entries * sizeof(Entry));
+
   // Make addresses relative to themselves.
   tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
     i64 offset = sizeof(Entry) * i;
@@ -775,14 +816,7 @@ void fixup_arm_exidx_section(Context<E> &ctx) {
       ent[i].val = 0x7fff'ffff & (ent[i].val - offset);
   });
 
-  // .ARM.exidx's sh_link should be set to the .text section index.
-  // Runtime doesn't care about it, but the binutils's strip command does.
-  if (ctx.shdr) {
-    if (Chunk<E> *text = find_section(ctx, ".text")) {
-      osec->shdr.sh_link = text->shndx;
-      ctx.shdr->copy_buf(ctx);
-    }
-  }
+  return buf;
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-arm64.cc b/src/arch-arm64.cc
similarity index 96%
rename from elf/arch-arm64.cc
rename to src/arch-arm64.cc
index 90e0bd71..6fc237b8 100644
--- a/elf/arch-arm64.cc
+++ b/src/arch-arm64.cc
@@ -19,7 +19,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = ARM64;
 
@@ -46,7 +46,7 @@ static u64 page(u64 val) {
 
 template <>
 void write_plt_header(Context<E> &ctx, u8 *buf) {
-  static const ul32 insn[] = {
+  constexpr ul32 insn[] = {
     0xa9bf'7bf0, // stp  x16, x30, [sp,#-16]!
     0x9000'0010, // adrp x16, .got.plt[2]
     0xf940'0211, // ldr  x17, [x16, .got.plt[2]]
@@ -68,7 +68,7 @@ void write_plt_header(Context<E> &ctx, u8 *buf) {
 
 template <>
 void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
-  static const ul32 insn[] = {
+  constexpr ul32 insn[] = {
     0x9000'0010, // adrp x16, .got.plt[n]
     0xf940'0211, // ldr  x17, [x16, .got.plt[n]]
     0x9100'0210, // add  x16, x16, .got.plt[n]
@@ -86,7 +86,7 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
 
 template <>
 void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
-  static const ul32 insn[] = {
+  constexpr ul32 insn[] = {
     0x9000'0010, // adrp x16, GOT[n]
     0xf940'0211, // ldr  x17, [x16, GOT[n]]
     0xd61f'0220, // br   x17
@@ -145,11 +145,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -173,7 +168,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
     switch (rel.r_type) {
     case R_AARCH64_ABS64:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_AARCH64_LDST8_ABS_LO12_NC:
     case R_AARCH64_ADD_ABS_LO12_NC:
@@ -383,19 +377,21 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       //   blr     x1
       //       R_AARCH64_TLSDESC_CALL       foo
       //
-      // We may relax the instructions to the following for non-dlopen'd DSO
+      // We may relax the instructions to the following if its TP-relative
+      // address is known at link-time
       //
       //   nop
       //   nop
-      //   adrp    x0, :gottprel:foo
-      //   ldr     x0, [x0, :gottprel_lo12:foo]
+      //   movz    x0, :tls_offset_hi:foo, lsl #16
+      //   movk    x0, :tls_offset_lo:foo
       //
-      // or to the following for executable.
+      // or to the following if the TP-relative address is known at
+      // process startup time.
       //
       //   nop
       //   nop
-      //   movz    x0, :tls_offset_hi:foo, lsl #16
-      //   movk    x0, :tls_offset_lo:foo
+      //   adrp    x0, :gottprel:foo
+      //   ldr     x0, [x0, :gottprel_lo12:foo]
       if (sym.has_tlsdesc(ctx)) {
         i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P);
         check(val, -(1LL << 32), 1LL << 32);
@@ -488,8 +484,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -505,9 +499,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_GOT | NEEDS_PLT;
 
     switch (rel.r_type) {
-    case R_AARCH64_ABS64:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_AARCH64_MOVW_UABS_G3:
       scan_absrel(ctx, sym, rel);
       break;
@@ -567,6 +558,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_AARCH64_ABS64:
     case R_AARCH64_ADD_ABS_LO12_NC:
     case R_AARCH64_ADR_PREL_LO21:
     case R_AARCH64_CONDBR19:
@@ -603,7 +595,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
 
 template <>
 void Thunk<E>::copy_buf(Context<E> &ctx) {
-  static const ul32 insn[] = {
+  constexpr ul32 insn[] = {
     0x9000'0010, // adrp x16, 0   # R_AARCH64_ADR_PREL_PG_HI21
     0x9100'0210, // add  x16, x16 # R_AARCH64_ADD_ABS_LO12_NC
     0xd61f'0200, // br   x16
@@ -626,4 +618,4 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-i386.cc b/src/arch-i386.cc
similarity index 95%
rename from elf/arch-i386.cc
rename to src/arch-i386.cc
index 1494d98d..008faaf0 100644
--- a/elf/arch-i386.cc
+++ b/src/arch-i386.cc
@@ -35,7 +35,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = I386;
 
@@ -226,7 +226,7 @@ static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
 }
 
 // Relax LD to LE
-static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
+static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 tls_size) {
   switch (rel.r_type) {
   case R_386_PLT32:
   case R_386_PC32: {
@@ -235,7 +235,7 @@ static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
       0x2d, 0, 0, 0, 0,       // sub $tls_size, %eax
     };
     memcpy(loc - 2, insn, sizeof(insn));
-    *(ul32 *)(loc + 5) = val;
+    *(ul32 *)(loc + 5) = tls_size;
     break;
   }
   case R_386_GOT32:
@@ -246,7 +246,7 @@ static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
       0x90,                   // nop
     };
     memcpy(loc - 2, insn, sizeof(insn));
-    *(ul32 *)(loc + 5) = val;
+    *(ul32 *)(loc + 5) = tls_size;
     break;
   }
   default:
@@ -286,11 +286,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -322,7 +317,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       *(ul16 *)loc = S + A;
       break;
     case R_386_32:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_386_PC8:
       check(S + A - P, -(1 << 7), 1 << 7);
@@ -374,7 +368,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_386_TLS_LDM:
       if (ctx.got->has_tlsld(ctx))
         *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
-        else
+      else
         relax_ld_to_le(loc, rels[++i], ctx.tp_addr - ctx.tls_begin);
       break;
     case R_386_TLS_LDO_32:
@@ -392,14 +386,16 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       //   call   *(%eax)
       //       R_386_TLS_DESC_CALL foo
       //
-      // We may relax the instructions to the following for non-dlopen'd DSO
+      // We may relax the instructions to the following if its TP-relative
+      // address is known at link-time
       //
-      //   mov     foo@GOTTPOFF(%ebx), %eax
+      //   mov     $foo@TPOFF, %eax
       //   nop
       //
-      // or to the following for executable.
+      // or to the following if the TP-relative address is known at
+      // process startup time.
       //
-      //   mov     $foo@TPOFF, %eax
+      //   mov     foo@GOTTPOFF(%ebx), %eax
       //   nop
       //
       // We allow the following alternative code sequence too because
@@ -518,8 +514,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -549,9 +543,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_386_16:
       scan_absrel(ctx, sym, rel);
       break;
-    case R_386_32:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_386_PC8:
     case R_386_PC16:
     case R_386_PC32:
@@ -581,8 +572,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_386_TLS_GD:
       // We always relax if -static because libc.a doesn't contain
       // __tls_get_addr().
-      if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) ||
-          ctx.arg.is_static)
+      if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx)))
         i++;
       else
         sym.flags |= NEEDS_TLSGD;
@@ -590,7 +580,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_386_TLS_LDM:
       // We always relax if -static because libc.a doesn't contain
       // __tls_get_addr().
-      if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared))
+      if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared))
         i++;
       else
         ctx.needs_tlsld = true;
@@ -601,6 +591,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_386_TLS_LE:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_386_32:
     case R_386_GOTOFF:
     case R_386_TLS_LDO_32:
     case R_386_SIZE32:
@@ -612,4 +603,4 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-loongarch.cc b/src/arch-loongarch.cc
similarity index 53%
rename from elf/arch-loongarch.cc
rename to src/arch-loongarch.cc
index 54ce7e44..dda138e9 100644
--- a/elf/arch-loongarch.cc
+++ b/src/arch-loongarch.cc
@@ -10,21 +10,22 @@
 // bootstrapping the entire ecosystem for LoongArch, sending patches to
 // Linux, GCC, LLVM, etc.
 //
-// All instructions are 4 bytes long in LoongArch and aligned to 4-byte
-// boundaries. It has 32 general-purpose registers. Among these, $t0 - $t8
-// (aliases for $r12 - $r20) are temporary registers that we can use in
-// our PLT and range extension thunks.
+// Speaking of the ISA, all instructions are 4 bytes long and aligned to
+// 4-byte boundaries in LoongArch. It has 32 general-purpose registers.
+// Among these, $t0 - $t8 (aliases for $r12 - $r20) are temporary
+// registers that we can use in our PLT.
 //
-// The psABI defines a few linker relaxations. We haven't supported them
-// yet.
+// Just like RISC-V, LoongArch supports section-shrinking relaxations.
+// That is, it allows linkers to rewrite certain instruction sequences to
+// shorter ones. Sections are not an atomic unit of copying.
 //
-// https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html
+// https://github.com/loongson/la-abi-specs/blob/release/laelf.adoc
 
 #if MOLD_LOONGARCH64 || MOLD_LOONGARCH32
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = MOLD_TARGET;
 
@@ -50,7 +51,7 @@ static u64 hi20(u64 val, u64 pc) {
   return bits(page(val + 0x800) - page(pc), 31, 12);
 }
 
-static u64 hi64(u64 val, u64 pc) {
+static u64 higher20(u64 val, u64 pc) {
   // A PC-relative 64-bit address is materialized with the following
   // instructions for the large code model:
   //
@@ -64,21 +65,15 @@ static u64 hi64(u64 val, u64 pc) {
   // ADDI.D adds a sign-extended 12 bit value to a register. LU32I.D and
   // LU52I.D simply set bits to [51:31] and to [63:53], respectively.
   //
-  // Compensating all the sign-extensions is a bit complicated.
-  u64 x = page(val) - page(pc);
-  if (val & 0x800)
-    x += 0x1000 - 0x1'0000'0000;
-  if (x & 0x8000'0000)
-    x += 0x1'0000'0000;
-  return x;
-}
-
-static u64 higher20(u64 val, u64 pc) {
-  return bits(hi64(val, pc), 51, 32);
+  // Compensating all the sign-extensions is a bit complicated. The
+  // psABI gave the following formula.
+  val = val + 0x8000'0000 + ((val & 0x800) ? (0x1000 - 0x1'0000'0000) : 0);
+  return bits(page(val) - page(pc - 8), 51, 32);
 }
 
 static u64 highest12(u64 val, u64 pc) {
-  return bits(hi64(val, pc), 63, 52);
+  val = val + 0x8000'0000 + ((val & 0x800) ? (0x1000 - 0x1'0000'0000) : 0);
+  return bits(page(val) - page(pc - 12), 63, 52);
 }
 
 static void write_k12(u8 *loc, u32 val) {
@@ -113,9 +108,47 @@ static void write_d10k16(u8 *loc, u32 val) {
   *(ul32 *)loc |= bits(val, 25, 16);
 }
 
+static u32 get_rd(u32 insn) {
+  return bits(insn, 4, 0); // rd field: instruction bits [4:0]
+}
+
+static u32 get_rj(u32 insn) {
+  return bits(insn, 9, 5); // rj field: instruction bits [9:5]
+}
+
+static void set_rj(u8 *loc, u32 rj) {
+  assert(rj < 32); // must fit in the 5-bit field
+  *(ul32 *)loc &= 0b111111'1111111111111111'00000'11111; // clear bits [9:5]
+  *(ul32 *)loc |= rj << 5; // insert the new rj value
+}
+
+// Returns true if isec's i'th relocation refers to the following
+// relaxable instruction pair.
+//
+//   pcalau12i $t0, 0         # R_LARCH_GOT_PC_HI20
+//   ld.d      $t0, $t0, 0    # R_LARCH_GOT_PC_LO12
+static bool is_relaxable_got_load(Context<E> &ctx, InputSection<E> &isec, i64 i) {
+  std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
+  Symbol<E> &sym = *isec.file.symbols[rels[i].r_sym];
+
+  if (ctx.arg.relax &&
+      sym.is_pcrel_linktime_const(ctx) &&
+      i + 3 < rels.size() && // rels[i + 2] and rels[i + 3] are read below
+      rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 &&
+      rels[i + 2].r_offset == rels[i].r_offset + 4 && // the very next insn
+      rels[i + 3].r_type == R_LARCH_RELAX) {
+    u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset);
+    u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4);
+    bool is_ld_d = (insn2 & 0xffc0'0000) == 0x28c0'0000; // ld.d opcode bits
+    return get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2) &&
+           is_ld_d; // same rd in both insns, and the load's base is that rd
+  }
+  return false;
+}
+
 template <>
 void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
-  static const ul32 insn_64[] = {
+  constexpr ul32 insn_64[] = {
     0x1a00'000e, // pcalau12i $t2, %pc_hi20(.got.plt)
     0x0011'bdad, // sub.d     $t1, $t1, $t3
     0x28c0'01cf, // ld.d      $t3, $t2, %lo12(.got.plt) # _dl_runtime_resolve
@@ -126,7 +159,7 @@ void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
     0x4c00'01e0, // jr        $t3
   };
 
-  static const ul32 insn_32[] = {
+  constexpr ul32 insn_32[] = {
     0x1a00'000e, // pcalau12i $t2, %pc_hi20(.got.plt)
     0x0011'3dad, // sub.w     $t1, $t1, $t3
     0x2880'01cf, // ld.w      $t3, $t2, %lo12(.got.plt) # _dl_runtime_resolve
@@ -146,18 +179,18 @@ void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
   write_k12(buf + 16, gotplt);
 }
 
-static const ul32 plt_entry_64[] = {
+constexpr ul32 plt_entry_64[] = {
   0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt)
   0x28c0'01ef, // ld.d      $t3, $t3, %lo12(func@.got.plt)
   0x4c00'01ed, // jirl      $t1, $t3, 0
-  0x0340'0000, // nop
+  0x002a'0000, // break
 };
 
-static const ul32 plt_entry_32[] = {
+constexpr ul32 plt_entry_32[] = {
   0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt)
   0x2880'01ef, // ld.w      $t3, $t3, %lo12(func@.got.plt)
   0x4c00'01ed, // jirl      $t1, $t3, 0
-  0x0340'0000, // nop
+  0x002a'0000, // break
 };
 
 template <>
@@ -233,10 +266,9 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
+  auto get_r_delta = [&](i64 idx) {
+    return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
+  };
 
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
@@ -247,7 +279,9 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       continue;
 
     Symbol<E> &sym = *file.symbols[rel.r_sym];
-    u8 *loc = base + rel.r_offset;
+    i64 r_offset = rel.r_offset - get_r_delta(i);
+    i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
+    u8 *loc = base + r_offset;
 
     auto check = [&](i64 val, i64 lo, i64 hi) {
       if (val < lo || hi <= val)
@@ -268,32 +302,28 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     // ones. Therefore, G may refer to a TLSGD or a regular GOT slot
     // depending on the symbol type.
     //
-    // Note that as of August 2023, both GCC and Clang treat TLSLD relocs
-    // as if they were TLSGD relocs for LoongArch, which is a clear bug.
-    // We need to handle TLSLD relocs as synonyms for TLSGD relocs for the
-    // sake of bug compatibility.
-    auto get_got_idx = [&] {
-      if (sym.has_tlsgd(ctx))
-        return sym.get_tlsgd_idx(ctx);
-      return sym.get_got_idx(ctx);
-    };
+    // Note that even though LoongArch defines relocations for TLSLD, TLSLD
+    // is not actually supported on it. GCC and LLVM emit identical machine
+    // code for -ftls-model=global-dynamic and -ftls-model=local-dynamic,
+    // and we need to handle TLSLD relocations as equivalent to TLSGD
+    // relocations. This is clearly a compiler bug, but it's too late to
+    // fix. The only way to fix it would be to define a new set of
+    // relocations for true TLSLD and deprecate the current ones. But it
+    // appears that migrating to TLSDESC is a better choice, so it's
+    // unlikely to happen.
+    i64 got_idx =
+      sym.has_tlsgd(ctx) ? sym.get_tlsgd_idx(ctx) : sym.get_got_idx(ctx);
 
     u64 S = sym.get_addr(ctx);
     u64 A = rel.r_addend;
-    u64 P = get_addr() + rel.r_offset;
-    u64 G = get_got_idx() * sizeof(Word<E>);
+    u64 P = get_addr() + r_offset;
+    u64 G = got_idx * sizeof(Word<E>);
     u64 GOT = ctx.got->shdr.sh_addr;
 
     switch (rel.r_type) {
     case R_LARCH_32:
       if constexpr (E::is_64)
         *(ul32 *)loc = S + A;
-      else
-        apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      break;
-    case R_LARCH_64:
-      assert(E::is_64);
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_LARCH_B16:
       check_branch(S + A - P, -(1 << 17), 1 << 17);
@@ -303,13 +333,10 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       check_branch(S + A - P, -(1 << 22), 1 << 22);
       write_d5k16(loc, (S + A - P) >> 2);
       break;
-    case R_LARCH_B26: {
-      i64 val = S + A - P;
-      if (val < -(1 << 27) || (1 << 27) <= val)
-        val = get_thunk_addr(i) + A - P;
-      write_d10k16(loc, val >> 2);
+    case R_LARCH_B26:
+      check_branch(S + A - P, -(1 << 27), 1 << 27);
+      write_d10k16(loc, (S + A - P) >> 2);
       break;
-    }
     case R_LARCH_ABS_LO12:
       write_k12(loc, S + A);
       break;
@@ -333,7 +360,15 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
         write_k12(loc, S + A);
       break;
     case R_LARCH_PCALA_HI20:
-      write_j20(loc, hi20(S + A, P));
+      if (removed_bytes == 0) {
+        write_j20(loc, hi20(S + A, P));
+      } else {
+        // Rewrite pcalau12i + addi.d with pcaddi
+        assert(removed_bytes == 4);
+        *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi
+        write_j20(loc, (S + A - P) >> 2);
+        i += 3;
+      }
       break;
     case R_LARCH_PCALA64_LO20:
       write_j20(loc, higher20(S + A, P));
@@ -345,7 +380,37 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       write_k12(loc, GOT + G + A);
       break;
     case R_LARCH_GOT_PC_HI20:
-      write_j20(loc, hi20(GOT + G + A, P));
+      if (removed_bytes == 0) {
+        // If the PC-relative symbol address is known at link-time, we can
+        // rewrite the following GOT load
+        //
+        //   pcalau12i $t0, 0         # R_LARCH_GOT_PC_HI20
+        //   ld.d      $t0, $t0, 0    # R_LARCH_GOT_PC_LO12
+        //
+        // with the following address materialization
+        //
+        //   pcalau12i $t0, 0
+        //   addi.d    $t0, $t0, 0
+        if (is_relaxable_got_load(ctx, *this, i)) {
+          i64 dist = compute_distance(ctx, sym, *this, rel);
+          if (-(1LL << 31) <= dist && dist < (1LL << 31)) {
+            u32 rd = get_rd(*(ul32 *)loc);
+            *(ul32 *)(loc + 4) = 0x02c0'0000 | (rd << 5) | rd; // addi.d
+
+            write_j20(loc, hi20(S + A, P));
+            write_k12(loc + 4, S + A);
+            i += 3;
+            break;
+          }
+        }
+        write_j20(loc, hi20(GOT + G + A, P));
+      } else {
+        // Rewrite pcalau12i + ld.d with pcaddi
+        assert(removed_bytes == 4);
+        *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi
+        write_j20(loc, (S + A - P) >> 2);
+        i += 3;
+      }
       break;
     case R_LARCH_GOT64_PC_LO20:
       write_j20(loc, higher20(GOT + G + A, P));
@@ -401,13 +466,13 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_LARCH_TLS_IE64_HI12:
       write_k12(loc, (sym.get_gottp_addr(ctx) + A) >> 52);
       break;
-    case R_LARCH_TLS_LD_PC_HI20:
     case R_LARCH_TLS_GD_PC_HI20:
+    case R_LARCH_TLS_LD_PC_HI20:
       check(sym.get_tlsgd_addr(ctx) + A - P, -(1LL << 31), 1LL << 31);
       write_j20(loc, hi20(sym.get_tlsgd_addr(ctx) + A, P));
       break;
-    case R_LARCH_TLS_LD_HI20:
     case R_LARCH_TLS_GD_HI20:
+    case R_LARCH_TLS_LD_HI20:
       write_j20(loc, (sym.get_tlsgd_addr(ctx) + A) >> 12);
       break;
     case R_LARCH_ADD6:
@@ -446,12 +511,148 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_LARCH_64_PCREL:
       *(ul64 *)loc = S + A - P;
       break;
+    case R_LARCH_CALL36:
+      if (removed_bytes == 0) {
+        write_j20(loc, (S + A - P + 0x20000) >> 18);
+        write_k16(loc + 4, (S + A - P) >> 2);
+      } else {
+        // Rewrite PCADDU18I + JIRL to B or BL
+        assert(removed_bytes == 4);
+        if (get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4)) == 0)
+          *(ul32 *)loc = 0x5000'0000; // B
+        else
+          *(ul32 *)loc = 0x5400'0000; // BL
+        write_d10k16(loc, (S + A - P) >> 2);
+      }
+      break;
     case R_LARCH_ADD_ULEB128:
       overwrite_uleb(loc, read_uleb(loc) + S + A);
       break;
     case R_LARCH_SUB_ULEB128:
       overwrite_uleb(loc, read_uleb(loc) - S - A);
       break;
+    case R_LARCH_TLS_DESC_PC_HI20:
+      // LoongArch TLSDESC uses the following code sequence to materialize
+      // a TP-relative address in a0.
+      //
+      //   pcalau12i $a0, 0
+      //       R_LARCH_TLS_DESC_PC_HI20    foo
+      //   addi.[dw] $a0, $a0, 0
+      //       R_LARCH_TLS_DESC_PC_LO12    foo
+      //   ld.d      $ra, $a0, 0
+      //       R_LARCH_TLS_DESC_LD         foo
+      //   jirl      $ra, $ra, 0
+      //       R_LARCH_TLS_DESC_CALL       foo
+      //
+      // We may relax the instructions to the following if its TP-relative
+      // address is known at link-time
+      //
+      //   <nop>
+      //   <nop>
+      //   lu12i.w   $a0, foo@TPOFF
+      //   addi.w    $a0, $a0, foo@TPOFF
+      //
+      // or to the following if the TP offset is small enough.
+      //
+      //   <nop>
+      //   <nop>
+      //   <nop>
+      //   ori       $a0, $zero, foo@TPOFF
+      //
+      // If the TP-relative address is known at process startup time, we
+      // may relax the instructions to the following.
+      //
+      //   <nop>
+      //   <nop>
+      //   pcalau12i $a0, foo@GOTTP
+      //   ld.[dw]   $a0, $a0, foo@GOTTP
+      //
+      // If we don't know anything about the symbol, we can still relax
+      // the first two instructions to a single pcaddi as shown below.
+      //
+      //   <nop>
+      //   pcaddi    $a0, foo@GOTDESC
+      //   ld.d      $ra, $a0, 0
+      //   jirl      $ra, $ra, 0
+      //
+      // Note that if section-shrinking relaxation is enabled, nop may be
+      // completely deleted.
+      if (removed_bytes == 0) {
+        if (sym.has_tlsdesc(ctx)) {
+          i64 dist = sym.get_tlsdesc_addr(ctx) + A - P;
+          if (ctx.arg.relax && -(1 << 21) <= dist && dist < (1 << 21)) {
+            *(ul32 *)loc = 0x0340'0000; // nop
+          } else {
+            write_j20(loc, hi20(sym.get_tlsdesc_addr(ctx) + A, P));
+          }
+        } else {
+          *(ul32 *)loc = 0x0340'0000; // nop
+        }
+      }
+      break;
+    case R_LARCH_TLS_DESC_PC_LO12:
+      if (removed_bytes == 0) {
+        if (sym.has_tlsdesc(ctx)) {
+          i64 dist = sym.get_tlsdesc_addr(ctx) + A - P;
+          if (ctx.arg.relax && -(1 << 21) <= dist && dist < (1 << 21)) {
+            // If we can directly materialize the PC-relative address
+            // with pcaddi, do that.
+            *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi
+            write_j20(loc, dist >> 2);
+          } else {
+            write_k12(loc, sym.get_tlsdesc_addr(ctx) + A);
+          }
+        } else {
+          *(ul32 *)loc = 0x0340'0000; // nop
+        }
+      }
+      break;
+    case R_LARCH_TLS_DESC_LD:
+      if (sym.has_tlsdesc(ctx) || removed_bytes == 4) {
+        // Do nothing
+      } else if (sym.has_gottp(ctx)) {
+        *(ul32 *)loc = 0x1a00'0004; // pcalau12i $a0, 0
+        write_j20(loc, hi20(sym.get_gottp_addr(ctx) + A, P));
+      } else {
+        *(ul32 *)loc = 0x1400'0004; // lu12i.w   $a0, 0
+        write_j20(loc, (S + A + 0x800 - ctx.tp_addr) >> 12);
+      }
+      break;
+    case R_LARCH_TLS_DESC_CALL:
+      if (sym.has_tlsdesc(ctx)) {
+        // Do nothing
+      } else if (sym.has_gottp(ctx)) {
+        if (E::is_64)
+          *(ul32 *)loc = 0x28c0'0084; // ld.d $a0, $a0, 0
+        else
+          *(ul32 *)loc = 0x2880'0084; // ld.w $a0, $a0, 0
+        write_k12(loc, sym.get_gottp_addr(ctx) + A);
+      } else {
+        i64 val = S + A - ctx.tp_addr;
+        if (val < 0x1000)
+          *(ul32 *)loc = 0x0380'0004; // ori    $a0, $zero, 0
+        else
+          *(ul32 *)loc = 0x0280'0084; // addi.w $a0, $a0, 0
+        write_k12(loc, val);
+      }
+      break;
+    case R_LARCH_TLS_LE_HI20_R:
+      if (removed_bytes == 0)
+        write_j20(loc, (S + A + 0x800 - ctx.tp_addr) >> 12);
+      break;
+    case R_LARCH_TLS_LE_LO12_R: {
+      i64 val = S + A - ctx.tp_addr;
+      write_k12(loc, val);
+
+      // Rewrite `addi.d $t0, $t0, <offset>` with `addi.d $t0, $tp, <offset>`
+      // if the offset is directly accessible using tp. tp is r2.
+      if (sign_extend(val, 11) == val)
+        set_rj(loc, 2);
+      break;
+    }
+    case R_LARCH_64:
+    case R_LARCH_TLS_LE_ADD_R:
+      break;
     default:
       unreachable();
     }
@@ -551,8 +752,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -576,15 +775,10 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_LARCH_32:
       if constexpr (E::is_64)
         scan_absrel(ctx, sym, rel);
-      else
-        scan_dyn_absrel(ctx, sym, rel);
-      break;
-    case R_LARCH_64:
-      assert(E::is_64);
-      scan_dyn_absrel(ctx, sym, rel);
       break;
     case R_LARCH_B26:
     case R_LARCH_PCALA_HI20:
+    case R_LARCH_CALL36:
       if (sym.is_imported)
         sym.flags |= NEEDS_PLT;
       break;
@@ -596,10 +790,10 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_LARCH_TLS_IE_PC_HI20:
       sym.flags |= NEEDS_GOTTP;
       break;
-    case R_LARCH_TLS_LD_PC_HI20:
     case R_LARCH_TLS_GD_PC_HI20:
-    case R_LARCH_TLS_LD_HI20:
+    case R_LARCH_TLS_LD_PC_HI20:
     case R_LARCH_TLS_GD_HI20:
+    case R_LARCH_TLS_LD_HI20:
       sym.flags |= NEEDS_TLSGD;
       break;
     case R_LARCH_32_PCREL:
@@ -610,8 +804,14 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_LARCH_TLS_LE_LO12:
     case R_LARCH_TLS_LE64_LO20:
     case R_LARCH_TLS_LE64_HI12:
+    case R_LARCH_TLS_LE_HI20_R:
+    case R_LARCH_TLS_LE_LO12_R:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_LARCH_TLS_DESC_CALL:
+      scan_tlsdesc(ctx, sym);
+      break;
+    case R_LARCH_64:
     case R_LARCH_B16:
     case R_LARCH_B21:
     case R_LARCH_ABS_HI20:
@@ -645,6 +845,10 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_LARCH_SUB64:
     case R_LARCH_ADD_ULEB128:
     case R_LARCH_SUB_ULEB128:
+    case R_LARCH_TLS_DESC_PC_HI20:
+    case R_LARCH_TLS_DESC_PC_LO12:
+    case R_LARCH_TLS_DESC_LD:
+    case R_LARCH_TLS_LE_ADD_R:
       break;
     default:
       Error(ctx) << *this << ": unknown relocation: " << rel;
@@ -653,29 +857,157 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
 }
 
 template <>
-void Thunk<E>::copy_buf(Context<E> &ctx) {
-  static const ul32 insn[] = {
-    0x1e00'000c, // pcaddu18i $t0, 0
-    0x4c00'0180, // jirl      $zero, $t0, 0
-  };
+void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
+  std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
+  isec.extra.r_deltas.resize(rels.size() + 1);
+  i64 delta = 0;
 
-  static_assert(E::thunk_size == sizeof(insn));
-
-  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
-  u64 P = output_section.shdr.sh_addr + offset;
+  for (i64 i = 0; i < rels.size(); i++) {
+    const ElfRel<E> &r = rels[i];
+    Symbol<E> &sym = *isec.file.symbols[r.r_sym];
+    isec.extra.r_deltas[i] = delta;
+
+    // A R_LARCH_ALIGN relocation refers to the beginning of a nop
+    // sequence. We need to remove some or all of them so that the
+    // instruction that immediately follows that is aligned to a specified
+    // boundary. To allow that, a R_LARCH_ALIGN relocation that requests
+    // 2^n alignment refers to 2^n - 4 bytes of nop instructions.
+    if (r.r_type == R_LARCH_ALIGN) {
+      // The actual rule for storing the alignment size is a bit weird.
+      // In particular, the most significant 56 bits of r_addend are
+      // sometimes used to store the upper limit of the alignment,
+      // allowing the instruction that follows nops _not_ to be aligned at
+      // all. I think that's a spec bug, so we don't want to support that.
+      i64 alignment;
+      if (r.r_sym) {
+        if (r.r_addend >> 8)
+          Fatal(ctx) << isec << ": ternary R_LARCH_ALIGN is not supported: " << i;
+        alignment = 1 << r.r_addend;
+      } else {
+        if (!has_single_bit(r.r_addend + 4))
+          Fatal(ctx) << isec << ": R_LARCH_ALIGN: invalid alignment requirement: "
+                     << i;
+        alignment = r.r_addend + 4;
+      }
+
+      u64 loc = isec.get_addr() + r.r_offset - delta;
+      u64 next_loc = loc + alignment - 4;
+      delta += next_loc - align_to(loc, alignment);
+      continue;
+    }
 
-  for (Symbol<E> *sym : symbols) {
-    u64 S = sym->get_addr(ctx);
+    // Handling other relocations is optional.
+    if (!ctx.arg.relax || i == rels.size() - 1 ||
+        rels[i + 1].r_type != R_LARCH_RELAX)
+      continue;
 
-    memcpy(buf, insn, sizeof(insn));
-    write_j20(buf, (S - P + 0x20000) >> 18);
-    write_k16(buf + 4, (S - P) >> 2);
+    // Skip linker-synthesized symbols because their final addresses
+    // are not fixed yet.
+    if (sym.file == ctx.internal_obj)
+      continue;
 
-    buf += sizeof(insn);
-    P += sizeof(insn);
+    switch (r.r_type) {
+    case R_LARCH_TLS_LE_HI20_R:
+    case R_LARCH_TLS_LE_ADD_R:
+      // LoongArch uses the following three instructions to access
+      // TP ± 2 GiB.
+      //
+      //  lu12i.w $t0, 0           # R_LARCH_TLS_LE_HI20_R
+      //  add.d   $t0, $t0, $tp    # R_LARCH_TLS_LE_ADD_R
+      //  addi.d  $t0, $t0, 0      # R_LARCH_TLS_LE_LO12_R
+      //
+      // If the thread-local variable is within TP ± 2 KiB, we can
+      // relax them into the following single instruction.
+      //
+      //  addi.d  $t0, $tp, <tp-offset>
+      if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
+          sign_extend(val, 11) == val)
+        delta += 4;
+      break;
+    case R_LARCH_PCALA_HI20:
+      // The following two instructions are used to materialize a
+      // PC-relative address with a 32 bit displacement.
+      //
+      //   pcalau12i $t0, 0         # R_LARCH_PCALA_HI20
+      //   addi.d    $t0, $t0, 0    # R_LARCH_PCALA_LO12
+      //
+      // If the displacement is within ±2 MiB, we can relax them to
+      // the following instruction.
+      //
+      //   pcaddi    $t0, <offset>
+      if (i + 3 < rels.size() &&
+          rels[i + 2].r_type == R_LARCH_PCALA_LO12 &&
+          rels[i + 2].r_offset == rels[i].r_offset + 4 &&
+          rels[i + 3].r_type == R_LARCH_RELAX) {
+        i64 dist = compute_distance(ctx, sym, isec, r);
+        u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset);
+        u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4);
+        bool is_addi_d = (insn2 & 0xffc0'0000) == 0x02c0'0000;
+
+        if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21) &&
+            is_addi_d && get_rd(insn1) == get_rd(insn2) &&
+            get_rd(insn2) == get_rj(insn2))
+          delta += 4;
+      }
+      break;
+    case R_LARCH_CALL36:
+      // A CALL36 relocation refers to the following instruction pair
+      // to jump to PC ± 128 GiB.
+      //
+      //   pcaddu18i $t0,       0         # R_LARCH_CALL36
+      //   jirl      $zero/$ra, $t0, 0
+      //
+      // If the displacement is within ±128 MiB, we can use B or BL instead.
+      // Note that $zero is $r0 and $ra is $r1.
+      if (i64 dist = compute_distance(ctx, sym, isec, r);
+          -(1 << 27) <= dist && dist < (1 << 27))
+        if (u32 jirl = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4);
+            get_rd(jirl) == 0 || get_rd(jirl) == 1)
+          delta += 4;
+      break;
+    case R_LARCH_GOT_PC_HI20:
+      // The following two instructions are used to load a symbol address
+      // from the GOT.
+      //
+      //   pcalau12i $t0, 0         # R_LARCH_GOT_PC_HI20
+      //   ld.d      $t0, $t0, 0    # R_LARCH_GOT_PC_LO12
+      //
+      // If the PC-relative symbol address is known at link-time, we can
+      // relax them to the following instruction.
+      //
+      //   pcaddi    $t0, <offset>
+      if (is_relaxable_got_load(ctx, isec, i)) {
+        i64 dist = compute_distance(ctx, sym, isec, r);
+        if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21))
+          delta += 4;
+      }
+      break;
+    case R_LARCH_TLS_DESC_PC_HI20:
+      if (sym.has_tlsdesc(ctx)) {
+        u64 P = isec.get_addr() + r.r_offset;
+        i64 dist = sym.get_tlsdesc_addr(ctx) + r.r_addend - P;
+        if (-(1 << 21) <= dist && dist < (1 << 21))
+          delta += 4;
+      } else {
+        delta += 4;
+      }
+      break;
+    case R_LARCH_TLS_DESC_PC_LO12:
+      if (!sym.has_tlsdesc(ctx))
+        delta += 4;
+      break;
+    case R_LARCH_TLS_DESC_LD:
+      if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx) &&
+          sym.get_addr(ctx) + r.r_addend - ctx.tp_addr < 0x1000)
+        delta += 4;
+      break;
+    }
   }
+
+  isec.extra.r_deltas[rels.size()] = delta;
+  isec.sh_size -= delta;
 }
 
-} // namespace mold::elf
+} // namespace mold
 
 #endif
diff --git a/elf/arch-m68k.cc b/src/arch-m68k.cc
similarity index 95%
rename from elf/arch-m68k.cc
rename to src/arch-m68k.cc
index f9de3be0..edffe048 100644
--- a/elf/arch-m68k.cc
+++ b/src/arch-m68k.cc
@@ -16,7 +16,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = M68K;
 
@@ -78,11 +78,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -126,7 +121,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
     switch (rel.r_type) {
     case R_68K_32:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_68K_16:
       write16(S + A);
@@ -251,8 +245,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   for (i64 i = 0; i < rels.size(); i++) {
@@ -266,9 +258,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       Error(ctx) << sym << ": GNU ifunc symbol is not supported on m68k";
 
     switch (rel.r_type) {
-    case R_68K_32:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_68K_16:
     case R_68K_8:
       scan_absrel(ctx, sym, rel);
@@ -312,6 +301,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_68K_TLS_LE8:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_68K_32:
     case R_68K_TLS_LDO32:
     case R_68K_TLS_LDO16:
     case R_68K_TLS_LDO8:
@@ -322,4 +312,4 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-ppc32.cc b/src/arch-ppc32.cc
similarity index 96%
rename from elf/arch-ppc32.cc
rename to src/arch-ppc32.cc
index 3bc0db6c..4525e73d 100644
--- a/elf/arch-ppc32.cc
+++ b/src/arch-ppc32.cc
@@ -42,7 +42,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = PPC32;
 
@@ -54,7 +54,7 @@ static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
 
 template <>
 void write_plt_header(Context<E> &ctx, u8 *buf) {
-  static const ub32 insn[] = {
+  constexpr ub32 insn[] = {
     // Get the address of this PLT section
     0x7c08'02a6, //    mflr    r0
     0x429f'0005, //    bcl     20, 31, 4
@@ -88,7 +88,7 @@ void write_plt_header(Context<E> &ctx, u8 *buf) {
   loc[5] |= lo(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
 }
 
-static const ub32 plt_entry[] = {
+constexpr ub32 plt_entry[] = {
   // Get the address of this PLT entry
   0x7c08'02a6, // mflr    r0
   0x429f'0005, // bcl     20, 31, 4
@@ -148,11 +148,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   u64 GOT2 = file.extra.got2 ? file.extra.got2->get_addr() : 0;
 
   for (i64 i = 0; i < rels.size(); i++) {
@@ -170,10 +165,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     u64 GOT = ctx.got->shdr.sh_addr;
 
     switch (rel.r_type) {
-    case R_PPC_ADDR32:
-    case R_PPC_UADDR32:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      break;
     case R_PPC_ADDR14:
       *(ub32 *)loc |= bits(S + A, 15, 2) << 2;
       break;
@@ -275,6 +266,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_PPC_GOT_TPREL16:
       *(ub16 *)loc = sym.get_gottp_addr(ctx) - GOT;
       break;
+    case R_PPC_ADDR32:
+    case R_PPC_UADDR32:
     case R_PPC_TLS:
     case R_PPC_TLSGD:
     case R_PPC_TLSLD:
@@ -323,8 +316,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -339,10 +330,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_GOT | NEEDS_PLT;
 
     switch (rel.r_type) {
-    case R_PPC_ADDR32:
-    case R_PPC_UADDR32:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_PPC_ADDR14:
     case R_PPC_ADDR16:
     case R_PPC_UADDR16:
@@ -391,6 +378,8 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_PPC_TPREL16_HA:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_PPC_ADDR32:
+    case R_PPC_UADDR32:
     case R_PPC_LOCAL24PC:
     case R_PPC_TLS:
     case R_PPC_TLSGD:
@@ -409,7 +398,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
 
 template <>
 void Thunk<E>::copy_buf(Context<E> &ctx) {
-  static const ub32 local_thunk[] = {
+  constexpr ub32 local_thunk[] = {
     // Get this thunk's address
     0x7c08'02a6, // mflr    r0
     0x429f'0005, // bcl     20, 31, 4
@@ -450,4 +439,4 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-ppc64v1.cc b/src/arch-ppc64v1.cc
similarity index 96%
rename from elf/arch-ppc64v1.cc
rename to src/arch-ppc64v1.cc
index cef71954..e3ec1c55 100644
--- a/elf/arch-ppc64v1.cc
+++ b/src/arch-ppc64v1.cc
@@ -50,7 +50,7 @@
 #include <algorithm>
 #include <tbb/parallel_for_each.h>
 
-namespace mold::elf {
+namespace mold {
 
 using E = PPC64V1;
 
@@ -68,7 +68,7 @@ static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
 // resolved addresses.
 template <>
 void write_plt_header(Context<E> &ctx, u8 *buf) {
-  static const ub32 insn[] = {
+  constexpr ub32 insn[] = {
     0x7d88'02a6, // mflr    r12
     0x429f'0005, // bcl     20, 31, 4 // obtain PC
     0x7d68'02a6, // mflr    r11
@@ -101,7 +101,7 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
   // call to the PLT entry jumps to. So we need to strictly follow the PLT
   // section layout as the loader expect it to be.
   if (idx < 0x8000) {
-    static const ub32 insn[] = {
+    constexpr ub32 insn[] = {
       0x3800'0000, // li      r0, PLT_INDEX
       0x4b00'0000, // b       plt0
     };
@@ -110,7 +110,7 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
     loc[0] |= idx;
     loc[1] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 4) & 0x00ff'ffff;
   } else {
-    static const ub32 insn[] = {
+    constexpr ub32 insn[] = {
       0x3c00'0000, // lis     r0, PLT_INDEX@high
       0x6000'0000, // ori     r0, r0, PLT_INDEX@lo
       0x4b00'0000, // b       plt0
@@ -154,11 +154,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -182,11 +177,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     u64 TOC = ctx.extra.TOC->value;
 
     switch (rel.r_type) {
-    case R_PPC64_ADDR64:
-      apply_toc_rel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      break;
     case R_PPC64_TOC:
-      apply_toc_rel(ctx, *ctx.extra.TOC, rel, loc, TOC, A, P, &dynrel);
       break;
     case R_PPC64_TOC16_HA:
       *(ub16 *)loc = ha(S + A - TOC);
@@ -277,6 +268,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_PPC64_GOT_TPREL16_LO_DS:
       *(ub16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
       break;
+    case R_PPC64_ADDR64:
     case R_PPC64_PLTSEQ:
     case R_PPC64_PLTCALL:
     case R_PPC64_TLS:
@@ -341,8 +333,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -362,10 +352,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_PPC_OPD;
 
     switch (rel.r_type) {
-    case R_PPC64_ADDR64:
-    case R_PPC64_TOC:
-      scan_toc_rel(ctx, sym, rel);
-      break;
     case R_PPC64_GOT_TPREL16_HA:
       sym.flags |= NEEDS_GOTTP;
       break;
@@ -387,6 +373,8 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_PPC64_TPREL16_LO_DS:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_PPC64_ADDR64:
+    case R_PPC64_TOC:
     case R_PPC64_REL32:
     case R_PPC64_REL64:
     case R_PPC64_TOC16_HA:
@@ -421,7 +409,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
   // If the destination is .plt.got, we save the current r2, read an
   // address of a function descriptor from .got, restore %r2 and jump
   // to the function.
-  static const ub32 pltgot_thunk[] = {
+  constexpr ub32 pltgot_thunk[] = {
     // Store the caller's %r2
     0xf841'0028, // std   %r2, 40(%r1)
 
@@ -439,7 +427,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
   };
 
   // If the destination is .plt, read a function descriptor from .got.plt.
-  static const ub32 plt_thunk[] = {
+  constexpr ub32 plt_thunk[] = {
     // Store the caller's %r2
     0xf841'0028, // std   %r2, 40(%r1)
 
@@ -458,7 +446,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
 
   // If the destination is a non-imported function, we directly jump
   // to the function entry address.
-  static const ub32 local_thunk[] = {
+  constexpr ub32 local_thunk[] = {
     0x3d82'0000, // addis r12, r2,  foo@toc@ha
     0x398c'0000, // addi  r12, r12, foo@toc@lo
     0x7d89'03a6, // mtctr r12
@@ -689,4 +677,4 @@ void PPC64OpdSection::copy_buf(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-ppc64v2.cc b/src/arch-ppc64v2.cc
similarity index 96%
rename from elf/arch-ppc64v2.cc
rename to src/arch-ppc64v2.cc
index 15f855af..78456fdb 100644
--- a/elf/arch-ppc64v2.cc
+++ b/src/arch-ppc64v2.cc
@@ -82,7 +82,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = PPC64V2;
 
@@ -106,7 +106,7 @@ static void write34(u8 *loc, u64 x) {
 // resolved addresses.
 template <>
 void write_plt_header(Context<E> &ctx, u8 *buf) {
-  static const ul32 insn[] = {
+  constexpr ul32 insn[] = {
     // Get PC
     0x7c08'02a6, // mflr    r0
     0x429f'0005, // bcl     20, 31, 4 // obtain PC
@@ -186,11 +186,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -210,12 +205,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 8; };
 
     switch (rel.r_type) {
-    case R_PPC64_ADDR64:
-      if (name() == ".toc")
-        apply_toc_rel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      else
-        apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      break;
     case R_PPC64_TOC16_HA:
       *(ul16 *)loc = ha(S + A - TOC);
       break;
@@ -337,6 +326,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_PPC64_TPREL34:
       write34(loc, S + A - ctx.tp_addr);
       break;
+    case R_PPC64_ADDR64:
     case R_PPC64_PLTSEQ:
     case R_PPC64_PLTSEQ_NOTOC:
     case R_PPC64_PLTCALL:
@@ -403,8 +393,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -419,12 +407,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_GOT | NEEDS_PLT;
 
     switch (rel.r_type) {
-    case R_PPC64_ADDR64:
-      if (name() == ".toc")
-        scan_toc_rel(ctx, sym, rel);
-      else
-        scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_PPC64_GOT_TPREL16_HA:
     case R_PPC64_GOT_TPREL_PCREL34:
       sym.flags |= NEEDS_GOTTP;
@@ -458,6 +440,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_PPC64_TPREL34:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_PPC64_ADDR64:
     case R_PPC64_REL32:
     case R_PPC64_REL64:
     case R_PPC64_TOC16_HA:
@@ -495,7 +478,7 @@ template <>
 void Thunk<E>::copy_buf(Context<E> &ctx) {
   // If the destination is PLT, we read an address from .got.plt or .got
   // and jump there.
-  static const ul32 plt_thunk[] = {
+  constexpr ul32 plt_thunk[] = {
     0xf841'0018, // std   r2, 24(r1)
     0x6000'0000, // nop
     0x3d82'0000, // addis r12, r2, foo@gotplt@toc@ha
@@ -504,7 +487,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
     0x4e80'0420, // bctr
   };
 
-  static const ul32 plt_thunk_power10[] = {
+  constexpr ul32 plt_thunk_power10[] = {
     0xf841'0018, // std   r2, 24(r1)
     0x6000'0000, // nop
     0x0410'0000, // pld   r12, foo@gotplt@pcrel
@@ -515,7 +498,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
 
   // If the destination is a non-imported function, we directly jump
   // to its local entry point.
-  static const ul32 local_thunk[] = {
+  constexpr ul32 local_thunk[] = {
     0xf841'0018, // std   r2, 24(r1)
     0x6000'0000, // nop
     0x3d82'0000, // addis r12, r2,  foo@toc@ha
@@ -524,7 +507,7 @@ void Thunk<E>::copy_buf(Context<E> &ctx) {
     0x4e80'0420, // bctr
   };
 
-  static const ul32 local_thunk_power10[] = {
+  constexpr ul32 local_thunk_power10[] = {
     0xf841'0018, // std   r2, 24(r1)
     0x6000'0000, // nop
     0x0610'0000, // pla   r12, foo@pcrel
@@ -677,4 +660,4 @@ u64 get_eflags(Context<E> &ctx) {
   return 2;
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-riscv.cc b/src/arch-riscv.cc
similarity index 83%
rename from elf/arch-riscv.cc
rename to src/arch-riscv.cc
index 7e7f618b..fd600b61 100644
--- a/elf/arch-riscv.cc
+++ b/src/arch-riscv.cc
@@ -12,71 +12,20 @@
 // From the linker's point of view, the RISC-V's psABI is unique because
 // sections in input object files can be shrunk while being copied to the
 // output file. That is contrary to other psABIs in which sections are an
-// atomic unit of copying. Let me explain it in more details.
-//
-// Since RISC-V instructions are 16-bit or 32-bit long, there's no way to
-// embed a very large immediate into a branch instruction. In fact, JAL
-// (jump and link) instruction can jump to only within PC ± 1 MiB because
-// its immediate is only 21 bits long. If the destination is out of its
-// reach, we need to use two instructions instead; the first instruction
-// being AUIPC which sets upper 20 bits to a register and the second being
-// JALR with a 12-bit immediate and the register. Combined, they specify a
-// 32 bits displacement.
-//
-// Other RISC ISAs have the same limitation, and they solved the problem by
-// letting the linker create so-called "range extension thunks". It works as
-// follows: the compiler optimistically emits single jump instructions for
-// function calls. If the linker finds that a branch target is out of reach,
-// it emits a small piece of machine code near the branch instruction and
-// redirect the branch to the linker-synthesized code. The code constructs a
-// full 32-bit address in a register and jump to the destination. That
-// linker-synthesized code is called "range extension thunks" or just
-// "thunks".
-//
-// The RISC-V psABI is unique that it works the other way around. That is,
-// for RISC-V, the compiler always emits two instructions (AUIPC + JAL) for
-// function calls. If the linker finds the destination is reachable with a
-// single instruction, it replaces the two instructions with the one and
-// shrink the section size by one instruction length, instead of filling the
-// gap with a nop.
-//
-// With the presence of this relaxation, sections can no longer be
-// considered as an atomic unit. If we delete 4 bytes from the middle of a
-// section, all contents after that point needs to be shifted by 4. Symbol
-// values and relocation offsets have to be adjusted accordingly if they
-// refer to past the deleted bytes.
-//
-// In mold, we use `r_deltas` to memorize how many bytes have be adjusted
-// for relocations. For symbols, we directly mutate their `value` member.
-//
-// RISC-V object files tend to have way more relocations than those for
-// other targets. This is because all branches, including ones that jump
-// within the same section, are explicitly expressed with relocations.
-// Here is why we need them: all control-flow statements such as `if` or
-// `for` are implemented using branch instructions. For other targets, the
-// compiler doesn't emit relocations for such branches because they know
-// at compile-time exactly how many bytes has to be skipped. That's not
-// true to RISC-V because the linker may delete bytes between a branch and
-// its destination. Therefore, all branches including in-section ones have
-// to be explicitly expressed with relocations.
-//
-// Note that this mechanism only shrink sections and never enlarge, as
-// the compiler always emits the longest instruction sequence. This
-// makes the linker implementation a bit simpler because we don't need
-// to worry about oscillation.
+// atomic unit of copying. See file comments in shrink-sections.cc for
+// details.
 //
 // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc
 
 #if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE
 
-#include "elf.h"
 #include "mold.h"
 
 #include <regex>
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_for_each.h>
 
-namespace mold::elf {
+namespace mold {
 
 using E = MOLD_TARGET;
 
@@ -141,7 +90,7 @@ static void set_rs1(u8 *loc, u32 rs1) {
 
 template <>
 void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
-  static const ul32 insn_64[] = {
+  constexpr ul32 insn_64[] = {
     0x0000'0397, // auipc  t2, %pcrel_hi(.got.plt)
     0x41c3'0333, // sub    t1, t1, t3               # .plt entry + hdr + 12
     0x0003'be03, // ld     t3, %pcrel_lo(1b)(t2)    # _dl_runtime_resolve
@@ -152,7 +101,7 @@ void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
     0x000e'0067, // jr     t3
   };
 
-  static const ul32 insn_32[] = {
+  constexpr ul32 insn_32[] = {
     0x0000'0397, // auipc  t2, %pcrel_hi(.got.plt)
     0x41c3'0333, // sub    t1, t1, t3               # .plt entry + hdr + 12
     0x0003'ae03, // lw     t3, %pcrel_lo(1b)(t2)    # _dl_runtime_resolve
@@ -172,14 +121,14 @@ void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
   write_itype(buf + 16, gotplt - plt);
 }
 
-static const ul32 plt_entry_64[] = {
+constexpr ul32 plt_entry_64[] = {
   0x0000'0e17, // auipc   t3, %pcrel_hi(function@.got.plt)
   0x000e'3e03, // ld      t3, %pcrel_lo(1b)(t3)
   0x000e'0367, // jalr    t1, t3
   0x0010'0073, // ebreak
 };
 
-static const ul32 plt_entry_32[] = {
+constexpr ul32 plt_entry_32[] = {
   0x0000'0e17, // auipc   t3, %pcrel_hi(function@.got.plt)
   0x000e'2e03, // lw      t3, %pcrel_lo(1b)(t3)
   0x000e'0367, // jalr    t1, t3
@@ -261,11 +210,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
   u64 GP = ctx.__global_pointer ? ctx.__global_pointer->get_addr(ctx) : 0;
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   auto get_r_delta = [&](i64 idx) {
     return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
   };
@@ -316,12 +260,8 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_RISCV_32:
       if constexpr (E::is_64)
         *(U32<E> *)loc = S + A;
-      else
-        apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_RISCV_64:
-      assert(E::is_64);
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_RISCV_BRANCH:
       check(S + A - P, -(1 << 12), 1 << 12);
@@ -764,38 +704,9 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
   }
 }
 
-template <>
-void InputSection<E>::copy_contents_riscv(Context<E> &ctx, u8 *buf) {
-  // If a section is not relaxed, we can copy it as a one big chunk.
-  if (extra.r_deltas.empty()) {
-    copy_contents(ctx, buf);
-    return;
-  }
-
-  // A relaxed section is copied piece-wise.
-  std::span<const ElfRel<E>> rels = get_rels(ctx);
-  i64 pos = 0;
-
-  for (i64 i = 0; i < rels.size(); i++) {
-    i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i];
-    if (delta == 0)
-      continue;
-    assert(delta > 0);
-
-    const ElfRel<E> &r = rels[i];
-    memcpy(buf, contents.data() + pos, r.r_offset - pos);
-    buf += r.r_offset - pos;
-    pos = r.r_offset + delta;
-  }
-
-  memcpy(buf, contents.data() + pos, contents.size() - pos);
-}
-
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -813,17 +724,10 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_RISCV_32:
       if constexpr (E::is_64)
         scan_absrel(ctx, sym, rel);
-      else
-        scan_dyn_absrel(ctx, sym, rel);
       break;
     case R_RISCV_HI20:
       scan_absrel(ctx, sym, rel);
       break;
-    case R_RISCV_64:
-      if constexpr (!E::is_64)
-        Error(ctx) << *this << ": R_RISCV_64 cannot be used on RV32";
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_RISCV_CALL:
     case R_RISCV_CALL_PLT:
     case R_RISCV_PLT32:
@@ -856,6 +760,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       if (ctx.arg.shared)
         Error(ctx) << *this << ": R_RISCV_GPREL_HI20 may not be used with -shared";
       break;
+    case R_RISCV_64:
     case R_RISCV_BRANCH:
     case R_RISCV_JAL:
     case R_RISCV_PCREL_LO12_I:
@@ -918,34 +823,9 @@ u64 get_eflags(Context<E> &ctx) {
   return ret;
 }
 
-static bool is_resizable(InputSection<E> *isec) {
-  return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) &&
-         (isec->shdr().sh_flags & SHF_EXECINSTR);
-}
-
-// Returns the distance between a relocated place and a symbol.
-static i64 compute_distance(Context<E> &ctx, Symbol<E> &sym,
-                            InputSection<E> &isec, const ElfRel<E> &rel) {
-  // We handle absolute symbols as if they were infinitely far away
-  // because `shrink_section` may increase a distance between a branch
-  // instruction and an absolute symbol. Branching to an absolute
-  // location is extremely rare in real code, though.
-  if (sym.is_absolute())
-    return INT32_MAX;
-
-  // Likewise, relocations against weak undefined symbols won't be relaxed.
-  if (sym.esym().is_undef_weak())
-    return INT32_MAX;
-
-  // Compute a distance between the relocated place and the symbol.
-  i64 S = sym.get_addr(ctx);
-  i64 A = rel.r_addend;
-  i64 P = isec.get_addr() + rel.r_offset;
-  return S + A - P;
-}
-
-// Scan relocations to shrink sections.
-static void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
+// Scan relocations to shrink a given section.
+template <>
+void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
   std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
   isec.extra.r_deltas.resize(rels.size() + 1);
 
@@ -1126,55 +1006,6 @@ static void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc)
   isec.sh_size -= delta;
 }
 
-// Shrink sections by interpreting relocations.
-//
-// This operation seems to be optional, because by default longest
-// instructions are being used. However, calling this function is actually
-// mandatory because of R_RISCV_ALIGN. R_RISCV_ALIGN is a directive to the
-// linker to align the location referred to by the relocation to a
-// specified byte boundary. We at least have to interpret them to satisfy
-// the alignment constraints.
-template <>
-i64 riscv_resize_sections<E>(Context<E> &ctx) {
-  Timer t(ctx, "riscv_resize_sections");
-
-  // True if we can use the 2-byte instructions. This is usually true on
-  // Unix because RV64GC is generally considered the baseline hardware.
-  bool use_rvc = get_eflags(ctx) & EF_RISCV_RVC;
-
-  // Find all the relocations that can be relaxed.
-  // This step should only shrink sections.
-  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
-    for (std::unique_ptr<InputSection<E>> &isec : file->sections)
-      if (is_resizable(isec.get()))
-        shrink_section(ctx, *isec, use_rvc);
-  });
-
-  // Fix symbol values.
-  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
-    for (Symbol<E> *sym : file->symbols) {
-      if (sym->file != file)
-        continue;
-
-      InputSection<E> *isec = sym->get_input_section();
-      if (!isec || isec->extra.r_deltas.empty())
-        continue;
-
-      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
-      auto it = std::lower_bound(rels.begin(), rels.end(), sym->value,
-                                 [&](const ElfRel<E> &r, u64 val) {
-        return r.r_offset < val;
-      });
-
-      sym->value -= isec->extra.r_deltas[it - rels.begin()];
-    }
-  });
-
-  // Re-compute section offset again to finalize them.
-  compute_section_sizes(ctx);
-  return set_osec_offsets(ctx);
-}
-
 // ISA name handlers
 //
 // An example of ISA name is "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0".
@@ -1190,8 +1021,8 @@ i64 riscv_resize_sections<E>(Context<E> &ctx) {
 // Each extension consists of a name, a major version and a minor version.
 // For example, "m2p0" indicates the "m" extension of version 2.0. "p" is
 // just a separator. Versions are often omitted in documents, but they are
-// mandatory in .riscv.attributes. Likewise, abbreviations as "g" (which
-// is short for "IMAFD") are not allowed in .riscv.attributes.
+// mandatory in .riscv.attributes. Likewise, abbreviations such as "G"
+// (which is short for "IMAFD") are not allowed in .riscv.attributes.
 //
 // Each RISC-V object file contains an ISA string enumerating extensions
 // used by the object file. We need to merge input objects' ISA strings
@@ -1384,6 +1215,6 @@ void RiscvAttributesSection<E>::copy_buf(Context<E> &ctx) {
   write_vector(ctx.buf + this->shdr.sh_offset, contents);
 }
 
-} // namespace mold::elf
+} // namespace mold
 
 #endif
diff --git a/elf/arch-s390x.cc b/src/arch-s390x.cc
similarity index 94%
rename from elf/arch-s390x.cc
rename to src/arch-s390x.cc
index 5fe7539d..dedc607c 100644
--- a/elf/arch-s390x.cc
+++ b/src/arch-s390x.cc
@@ -37,7 +37,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = S390X;
 
@@ -116,11 +116,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -153,7 +148,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
     switch (rel.r_type) {
     case R_390_64:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_390_8:
       check(S + A, 0, 1 << 8);
@@ -256,7 +250,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       *(ub32 *)loc = (GOT + A - P) >> 1;
       break;
     case R_390_GOTENT:
-      check(GOT + G + A - P, -(1LL << 32), 1LL << 32);
+      check_dbl(GOT + G + A - P, -(1LL << 32), 1LL << 32);
       *(ub32 *)loc = (GOT + G + A - P) >> 1;
       break;
     case R_390_TLS_LE32:
@@ -303,22 +297,14 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_390_TLS_LDM32:
       if (ctx.got->has_tlsld(ctx))
         *(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
+      else
+        *(ub32 *)loc = ctx.dtp_addr - ctx.tp_addr;
       break;
     case R_390_TLS_LDM64:
       if (ctx.got->has_tlsld(ctx))
         *(ub64 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
-      break;
-    case R_390_TLS_LDO32:
-      if (ctx.got->has_tlsld(ctx))
-        *(ub32 *)loc = S + A - ctx.dtp_addr;
-      else
-        *(ub32 *)loc = S + A - ctx.tp_addr;
-      break;
-    case R_390_TLS_LDO64:
-      if (ctx.got->has_tlsld(ctx))
-        *(ub64 *)loc = S + A - ctx.dtp_addr;
       else
-        *(ub64 *)loc = S + A - ctx.tp_addr;
+        *(ub64 *)loc = ctx.dtp_addr - ctx.tp_addr;
       break;
     case R_390_TLS_LDCALL:
       if (!ctx.got->has_tlsld(ctx)) {
@@ -327,6 +313,12 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
         memcpy(loc, insn, sizeof(insn));
       }
       break;
+    case R_390_TLS_LDO32:
+      *(ub32 *)loc = S + A - ctx.dtp_addr;
+      break;
+    case R_390_TLS_LDO64:
+      *(ub64 *)loc = S + A - ctx.dtp_addr;
+      break;
     default:
       unreachable();
     }
@@ -385,8 +377,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -401,9 +391,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_GOT | NEEDS_PLT;
 
     switch (rel.r_type) {
-    case R_390_64:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_390_8:
     case R_390_12:
     case R_390_16:
@@ -457,8 +444,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       // We always want to relax calls to __tls_get_offset() in statically-
       // linked executables because __tls_get_offset() in libc.a just calls
       // abort().
-      if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) ||
-          ctx.arg.is_static) {
+      if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) {
         // Do nothing
       } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) {
         sym.flags |= NEEDS_GOTTP;
@@ -468,7 +454,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       break;
     case R_390_TLS_LDM32:
     case R_390_TLS_LDM64:
-      if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) {
+      if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) {
         // Do nothing
       } else {
         ctx.needs_tlsld = true;
@@ -478,6 +464,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_390_TLS_LE64:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_390_64:
     case R_390_TLS_LDO32:
     case R_390_TLS_LDO64:
     case R_390_TLS_GDCALL:
@@ -489,4 +476,4 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-sh4.cc b/src/arch-sh4.cc
similarity index 96%
rename from elf/arch-sh4.cc
rename to src/arch-sh4.cc
index bf307048..8e5d336a 100644
--- a/elf/arch-sh4.cc
+++ b/src/arch-sh4.cc
@@ -60,7 +60,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = SH4;
 
@@ -230,11 +230,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -251,7 +246,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
     switch (rel.r_type) {
     case R_SH_DIR32:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_SH_REL32:
     case R_SH_PLT32:
@@ -323,8 +317,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   for (i64 i = 0; i < rels.size(); i++) {
@@ -338,9 +330,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       Error(ctx) << sym << ": GNU ifunc symbol is not supported on sh4";
 
     switch (rel.r_type) {
-    case R_SH_DIR32:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_SH_REL32:
       scan_pcrel(ctx, sym, rel);
       break;
@@ -363,6 +352,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_SH_TLS_LE_32:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_SH_DIR32:
     case R_SH_GOTPC:
     case R_SH_GOTOFF:
     case R_SH_TLS_LDO_32:
@@ -373,4 +363,4 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-sparc64.cc b/src/arch-sparc64.cc
similarity index 84%
rename from elf/arch-sparc64.cc
rename to src/arch-sparc64.cc
index bebbe11d..b04bb301 100644
--- a/elf/arch-sparc64.cc
+++ b/src/arch-sparc64.cc
@@ -58,7 +58,7 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+namespace mold {
 
 using E = SPARC64;
 
@@ -142,11 +142,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -169,9 +164,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     u64 GOT = ctx.got->shdr.sh_addr;
 
     switch (rel.r_type) {
-    case R_SPARC_64:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
-      break;
     case R_SPARC_5:
       check(S + A, 0, 1 << 5);
       *(ub32 *)loc |= bits(S + A, 4, 0);
@@ -359,27 +351,75 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       *(ub32 *)loc |= bits(S + A, 11, 0);
       break;
     case R_SPARC_TLS_GD_HI22:
-      *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10);
+      if (sym.has_tlsgd(ctx)) {
+        *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10);
+      } else if (sym.has_gottp(ctx)) {
+        *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10);
+      } else {
+        *(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10);
+      }
       break;
     case R_SPARC_TLS_GD_LO10:
-      *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0);
+      if (sym.has_tlsgd(ctx)) {
+        *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0);
+      } else if (sym.has_gottp(ctx)) {
+        u32 rd = bits(*(ub32 *)loc, 29, 25);
+        *(ub32 *)loc = 0x8010'2000 | (rd << 25) | (rd << 14); // or  %reg, $0, %reg
+        *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0);
+      } else {
+        u32 rd = bits(*(ub32 *)loc, 29, 25);
+        *(ub32 *)loc = 0x8018'2000 | (rd << 25) | (rd << 14); // xor %reg, $0, %reg
+        *(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000;
+      }
+      break;
+    case R_SPARC_TLS_GD_ADD:
+      if (sym.has_tlsgd(ctx)) {
+        // do nothing
+      } else if (sym.has_gottp(ctx)) {
+        u32 rs2 = bits(*(ub32 *)loc, 4, 0);
+        *(ub32 *)loc = 0xd05d'c000 | rs2; // ldx [ %l7 + %reg ], %o0
+      } else {
+        u32 rs2 = bits(*(ub32 *)loc, 4, 0);
+        *(ub32 *)loc = 0x9001'c000 | rs2; // add %g7, %reg, %o0
+      }
       break;
     case R_SPARC_TLS_GD_CALL:
-    case R_SPARC_TLS_LDM_CALL: {
-      u64 addr;
-      if (ctx.arg.is_static)
-        addr = ctx.extra.tls_get_addr_sec->shdr.sh_addr;
-      else
-        addr = ctx.extra.tls_get_addr_sym->get_addr(ctx);
-
-      *(ub32 *)loc |= bits(addr + A - P, 31, 2);
+      if (sym.has_tlsgd(ctx)) {
+        u64 addr = ctx.extra.tls_get_addr->get_addr(ctx);
+        *(ub32 *)loc |= bits(addr + A - P, 31, 2);
+      } else if (sym.has_gottp(ctx)) {
+        *(ub32 *)loc = 0x9001'c008; // add %g7, %o0, %o0
+      } else {
+        *(ub32 *)loc = 0x0100'0000; // nop
+      }
       break;
-    }
     case R_SPARC_TLS_LDM_HI22:
-      *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10);
+      if (ctx.got->has_tlsld(ctx))
+        *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10);
+      else
+        *(ub32 *)loc |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10);
       break;
     case R_SPARC_TLS_LDM_LO10:
-      *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0);
+      if (ctx.got->has_tlsld(ctx))
+        *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0);
+      else
+        *(ub32 *)loc |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0);
+      break;
+    case R_SPARC_TLS_LDM_ADD:
+      if (ctx.got->has_tlsld(ctx)) {
+        // do nothing
+      } else {
+        u32 rs2 = bits(*(ub32 *)loc, 4, 0);
+        *(ub32 *)loc = 0x9021'c000 | rs2; // sub %g7, %reg, %o0
+      }
+      break;
+    case R_SPARC_TLS_LDM_CALL:
+      if (ctx.got->has_tlsld(ctx)) {
+        u64 addr = ctx.extra.tls_get_addr->get_addr(ctx);
+        *(ub32 *)loc |= bits(addr + A - P, 31, 2);
+      } else {
+        *(ub32 *)loc = 0x0100'0000; // nop
+      }
       break;
     case R_SPARC_TLS_LDO_HIX22:
       *(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 31, 10);
@@ -402,8 +442,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_SPARC_SIZE32:
       *(ub32 *)loc = sym.esym().st_size + A;
       break;
-    case R_SPARC_TLS_GD_ADD:
-    case R_SPARC_TLS_LDM_ADD:
+    case R_SPARC_64:
     case R_SPARC_TLS_LDO_ADD:
     case R_SPARC_TLS_IE_LD:
     case R_SPARC_TLS_IE_LDX:
@@ -471,8 +510,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -487,9 +524,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       sym.flags |= NEEDS_GOT | NEEDS_PLT;
 
     switch (rel.r_type) {
-    case R_SPARC_64:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_SPARC_8:
     case R_SPARC_5:
     case R_SPARC_6:
@@ -554,24 +588,36 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       scan_pcrel(ctx, sym, rel);
       break;
     case R_SPARC_TLS_GD_HI22:
-      sym.flags |= NEEDS_TLSGD;
+      if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) {
+        // We always relax if -static because libc.a doesn't contain
+        // __tls_get_addr().
+      } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) {
+        sym.flags |= NEEDS_GOTTP;
+      } else {
+        sym.flags |= NEEDS_TLSGD;
+      }
       break;
     case R_SPARC_TLS_LDM_HI22:
-      ctx.needs_tlsld = true;
+      if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) {
+        // We always relax if -static because libc.a doesn't contain
+        // __tls_get_addr().
+      } else {
+        ctx.needs_tlsld = true;
+      }
       break;
     case R_SPARC_TLS_IE_HI22:
       sym.flags |= NEEDS_GOTTP;
       break;
     case R_SPARC_TLS_GD_CALL:
     case R_SPARC_TLS_LDM_CALL:
-      if (!ctx.arg.is_static)
-        if (Symbol<E> &sym = *ctx.extra.tls_get_addr_sym; sym.is_imported)
-          sym.flags |= NEEDS_PLT;
+      if (Symbol<E> *sym = ctx.extra.tls_get_addr; sym->is_imported)
+        sym->flags |= NEEDS_PLT;
       break;
     case R_SPARC_TLS_LE_HIX22:
     case R_SPARC_TLS_LE_LOX10:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_SPARC_64:
     case R_SPARC_GOTDATA_OP_LOX10:
     case R_SPARC_GOTDATA_OP:
     case R_SPARC_GOTDATA_LOX10:
@@ -594,25 +640,4 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
-// __tls_get_addr is not defined by libc.a, so we can't use that function
-// in statically-linked executables. This section provides a replacement.
-void SparcTlsGetAddrSection::copy_buf(Context<E> &ctx) {
-  ub32 *buf = (ub32 *)(ctx.buf + this->shdr.sh_offset);
-
-  static const ub32 insn[] = {
-    0x0300'0000, // sethi  %hi(TP_SIZE), %g1
-    0x8210'6000, // or   %g1, %lo(TP_SIZE), %g1
-    0x8221'c001, // sub  %g7, %g1, %g1
-    0xd05a'2008, // ldx  [ %o0 + 8 ], %o0
-    0x81c3'e008, // retl
-    0x9000'4008, // add  %g1, %o0, %o0
-  };
-
-  assert(this->shdr.sh_size == sizeof(insn));
-  memcpy(buf, insn, sizeof(insn));
-
-  buf[0] |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10);
-  buf[1] |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0);
-}
-
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/arch-x86-64.cc b/src/arch-x86-64.cc
similarity index 84%
rename from elf/arch-x86-64.cc
rename to src/arch-x86-64.cc
index 9266b957..4e0b5f93 100644
--- a/elf/arch-x86-64.cc
+++ b/src/arch-x86-64.cc
@@ -28,7 +28,9 @@
 
 #include "mold.h"
 
-namespace mold::elf {
+#include <tbb/parallel_for_each.h>
+
+namespace mold {
 
 using E = X86_64;
 
@@ -299,10 +301,10 @@ static void relax_gd_to_ie(u8 *loc, ElfRel<E> rel, u64 val) {
 }
 
 // Rewrite a function call to __tls_get_addr to a cheaper instruction
-// sequence. The difference from relax_gd_to_le is that we are
-// materializing a Dynamic Thread Pointer for the current ELF module
-// instead of an address for a particular thread-local variable.
-static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 tls_size) {
+// sequence. The difference from relax_gd_to_le is that we are materializing
+// the address of the beginning of TLS block instead of an address of a
+// particular thread-local variable.
+static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, i64 tls_size) {
   switch (rel.r_type) {
   case R_X86_64_PLT32:
   case R_X86_64_PC32: {
@@ -311,10 +313,9 @@ static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 tls_size) {
     //  48 8d 3d 00 00 00 00    lea    foo@tlsld(%rip), %rdi
     //  e8 00 00 00 00          call   __tls_get_addr
     //
-    // The instructions are so short that we cannot rewrite them with
-    // "mov %fs:0, %rax" which is 9 bytes long. We use a shorter code
-    // sequence instead. Since "xor %eax, %eax" zero-clears %rax, the
-    // meaning is equivalent.
+    // The original instruction sequence is so short that we need a little
+    // bit of code golfing here. "mov %fs:0, %rax" is 9 bytes long, so xor +
+    // mov is shorter. Note that `xor %eax, %eax` zero-clears all of %rax.
     static const u8 insn[] = {
       0x31, 0xc0,                   // xor %eax, %eax
       0x64, 0x48, 0x8b, 0x00,       // mov %fs:(%rax), %rax
@@ -331,13 +332,12 @@ static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 tls_size) {
     //  48 8d 3d 00 00 00 00    lea    foo@tlsld(%rip), %rdi
     //  ff 15 00 00 00 00       call   *__tls_get_addr@GOT(%rip)
     static const u8 insn[] = {
-      0x31, 0xc0,                   // xor %eax, %eax
+      0x48, 0x31, 0xc0,             // xor %rax, %rax
       0x64, 0x48, 0x8b, 0x00,       // mov %fs:(%rax), %rax
       0x48, 0x2d, 0, 0, 0, 0,       // sub $tls_size, %rax
-      0x90,                         // nop
     };
     memcpy(loc - 3, insn, sizeof(insn));
-    *(ul32 *)(loc + 5) = tls_size;
+    *(ul32 *)(loc + 6) = tls_size;
     break;
   }
   case R_X86_64_PLTOFF64: {
@@ -368,11 +368,6 @@ template <>
 void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
-  ElfRel<E> *dynrel = nullptr;
-  if (ctx.reldyn)
-    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
-                           file.reldyn_offset + this->reldyn_offset);
-
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
     if (rel.r_type == R_NONE)
@@ -420,7 +415,6 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       write32s(S + A);
       break;
     case R_X86_64_64:
-      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
       break;
     case R_X86_64_PC8:
       check(S + A - P, -(1 << 7), 1 << 7);
@@ -536,14 +530,16 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       //   call   *(%rax)
       //       R_X86_64_TLSDESC_CALL       foo
       //
-      // We may relax the instructions to the following for non-dlopen'd DSO
+      // We may relax the instructions to the following if its TP-relative
+      // address is known at link-time
       //
-      //   mov     foo@GOTTPOFF(%rip), %rax
+      //   mov     $foo@TPOFF, %rax
       //   nop
       //
-      // or to the following for executable.
+      // or to the following if the TP-relative address is known at
+      // process startup time.
       //
-      //   mov     $foo@TPOFF, %rax
+      //   mov     foo@GOTTPOFF(%rip), %rax
       //   nop
       //
       // We allow the following alternative code sequence too because
@@ -707,8 +703,6 @@ void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
 template <>
 void InputSection<E>::scan_relocations(Context<E> &ctx) {
   assert(shdr().sh_flags & SHF_ALLOC);
-
-  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
   std::span<const ElfRel<E>> rels = get_rels(ctx);
 
   // Scan relocations
@@ -743,9 +737,6 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_X86_64_32S:
       scan_absrel(ctx, sym, rel);
       break;
-    case R_X86_64_64:
-      scan_dyn_absrel(ctx, sym, rel);
-      break;
     case R_X86_64_PC8:
     case R_X86_64_PC16:
     case R_X86_64_PC32:
@@ -768,8 +759,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
         sym.flags |= NEEDS_PLT;
       break;
     case R_X86_64_TLSGD:
-      if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) ||
-          ctx.arg.is_static) {
+      if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) {
         // We always relax if -static because libc.a doesn't contain
         // __tls_get_addr().
         i++;
@@ -783,7 +773,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_X86_64_TLSLD:
       // We always relax if -static because libc.a doesn't contain
       // __tls_get_addr().
-      if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared))
+      if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared))
         i++;
       else
         ctx.needs_tlsld = true;
@@ -803,6 +793,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
     case R_X86_64_TPOFF64:
       check_tlsle(ctx, sym, rel);
       break;
+    case R_X86_64_64:
     case R_X86_64_GOTOFF64:
     case R_X86_64_DTPOFF32:
     case R_X86_64_DTPOFF64:
@@ -816,4 +807,95 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
-} // namespace mold::elf
+// Intel CET is a relatively new CPU feature to enhance security by
+// protecting control flow integrity. If the feature is enabled, indirect
+// branches (i.e. branch instructions that take a register instead of an
+// immediate) must land on a "landing pad" instruction, or a CPU-level fault
+// is raised. That prevents an attacker from branching into the middle of a
+// random function, making ROP or JOP much harder to conduct.
+//
+// On x86-64, the landing pad instruction is ENDBR64. That is actually a
+// repurposed NOP instruction to provide binary compatibility with older
+// hardware that doesn't support CET.
+//
+// The problem here is that the compiler always emits a landing pad at the
+// beginning of a global function because it doesn't know whether or not the
+// function's address is taken in other translation units. As a result, the
+// resulting binary contains more landing pads than necessary.
+//
+// This function rewrites a landing pad with a nop if the function's address
+// was not actually taken. We can do what the compiler cannot because we
+// know about all translation units.
+void rewrite_endbr(Context<E> &ctx) {
+  Timer t(ctx, "rewrite_endbr");
+
+  constexpr u8 endbr64[] = {0xf3, 0x0f, 0x1e, 0xfa};
+  constexpr u8 nop[] = {0x0f, 0x1f, 0x40, 0x00};
+
+  // Rewrite all endbr64 instructions referred to by function symbols with
+  // NOPs. We handle only global symbols because the compiler doesn't emit
+  // an endbr64 for a file-scoped function in the first place if its address
+  // is not taken within the file.
+  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
+    for (Symbol<E> *sym : file->get_global_syms()) {
+      if (sym->file == file && sym->esym().st_type == STT_FUNC) {
+        if (InputSection<E> *isec = sym->get_input_section();
+            isec && (isec->shdr().sh_flags & SHF_EXECINSTR)) {
+          if (OutputSection<E> *osec = isec->output_section) {
+            u8 *buf = ctx.buf + osec->shdr.sh_offset + isec->offset + sym->value;
+            if (memcmp(buf, endbr64, 4) == 0)
+              memcpy(buf, nop, 4);
+          }
+        }
+      }
+    }
+  });
+
+  auto write_back = [&](InputSection<E> *isec, i64 offset) {
+    // If isec has an endbr64 at a given offset, copy it to the output
+    // buffer, possibly overwriting a nop written in the above loop. The
+    // bounds check is done in u64 so that sections under 4 bytes are safe.
+    if (isec && isec->output_section &&
+        (isec->shdr().sh_flags & SHF_EXECINSTR) &&
+        0 <= offset && (u64)offset + 4 <= isec->contents.size() &&
+        memcmp(isec->contents.data() + offset, endbr64, 4) == 0)
+      memcpy(ctx.buf + isec->output_section->shdr.sh_offset + isec->offset + offset,
+             endbr64, 4);
+  };
+
+  // Write back endbr64 instructions if they are referred to by address-taking
+  // relocations.
+  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
+    for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
+      if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC)) {
+        for (const ElfRel<E> &rel : isec->get_rels(ctx)) {
+          if (!is_func_call_rel(rel)) {
+            Symbol<E> *sym = file->symbols[rel.r_sym];
+            if (sym->esym().st_type == STT_SECTION)
+              write_back(sym->get_input_section(), rel.r_addend);
+            else
+              write_back(sym->get_input_section(), sym->value);
+          }
+        }
+      }
+    }
+  });
+
+  // We record addresses of some symbols in the ELF header, .dynamic or in
+  // .dynsym. We need to retain endbr64s for such symbols.
+  auto keep = [&](Symbol<E> *sym) {
+    if (sym)
+      write_back(sym->get_input_section(), sym->value);
+  };
+
+  keep(ctx.arg.entry);
+  keep(ctx.arg.init);
+  keep(ctx.arg.fini);
+
+  if (ctx.dynsym)
+    for (Symbol<E> *sym : ctx.dynsym->symbols)
+      if (sym && sym->is_exported)
+        keep(sym);
+}
+
+} // namespace mold
diff --git a/elf/cmdline.cc b/src/cmdline.cc
similarity index 92%
rename from elf/cmdline.cc
rename to src/cmdline.cc
index e053f090..bdb79ed9 100644
--- a/elf/cmdline.cc
+++ b/src/cmdline.cc
@@ -15,7 +15,7 @@
 # define STDERR_FILENO (_fileno(stderr))
 #endif
 
-namespace mold::elf {
+namespace mold {
 
 inline const char helpmsg[] = R"(
 Options:
@@ -44,7 +44,8 @@ inline const char helpmsg[] = R"(
   -f SHLIB, --auxiliary SHLIB Set DT_AUXILIARY to the specified value
   -h LIBNAME, --soname LIBNAME
                               Set shared library name
-  -l LIBNAME                  Search for a given library
+  -l LIBNAME, --library LIBNAME
+                              Search for a given library
   -m TARGET                   Set target
   -o FILE, --output FILE      Set output filename
   -q, --emit-relocs           Leaves relocation sections in the output
@@ -71,7 +72,7 @@ inline const char helpmsg[] = R"(
     --no-apply-dynamic-relocs
   --as-needed                 Only set DT_NEEDED if used
     --no-as-needed
-  --build-id [none,md5,sha1,sha256,uuid,HEXSTRING]
+  --build-id [none,md5,sha1,sha256,fast,uuid,HEXSTRING]
                               Generate build ID
     --no-build-id
   --chroot DIR                Set a given path to the root directory
@@ -85,11 +86,14 @@ inline const char helpmsg[] = R"(
   --defsym=SYMBOL=VALUE       Define a symbol alias
   --demangle                  Demangle C++ symbols in log messages (default)
     --no-demangle
+  --detach                    Create separate debug info file in the background (default)
+    --no-detach
   --enable-new-dtags          Emit DT_RUNPATH for --rpath (default)
     --disable-new-dtags       Emit DT_RPATH for --rpath
   --execute-only              Make executable segments unreadable
   --dp                        Ignored
   --dynamic-list=FILE         Read a list of dynamic symbols (implies -Bsymbolic)
+  --dynamic-list-data         Add data symbols to dynamic symbols
   --eh-frame-hdr              Create .eh_frame_hdr section
     --no-eh-frame-hdr
   --exclude-libs LIB,LIB,..   Mark all symbols in given libraries as hidden
@@ -143,6 +147,8 @@ inline const char helpmsg[] = R"(
   --rpath-link DIR            Ignored
   --run COMMAND ARG...        Run COMMAND with mold as /usr/bin/ld
   --section-start=SECTION=ADDR Set address for section
+  --separate-debug-file[=FILE] Separate debug info to the specified file
+    --no-separate-debug-file
   --shared, --Bshareable      Create a shared library
   --shuffle-sections[=SEED]   Randomize the output by shuffling input sections
   --sort-common               Ignored
@@ -209,13 +215,15 @@ inline const char helpmsg[] = R"(
   -z stack-size=VALUE         Set the size of the stack segment
   -z relro                    Make some sections read-only after relocation (default)
     -z norelro
+  -z rewrite-endbr            Rewrite indirect branch target instructions with NOPs
+    -z norewrite-endbr
   -z rodynamic                Make the .dynamic section read-only
   -z text                     Report error if DT_TEXTREL is set
     -z notext
     -z textoff
 
-mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm elf64-littleaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-alpha elf64-loongarch elf32-loongarch
-mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64linux aarch64elf elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux elf64alpha elf64loongarch elf32loongarch)";
+mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm elf64-littleaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-loongarch elf32-loongarch
+mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64linux aarch64elf elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux elf64loongarch elf32loongarch)";
 
 template <typename E>
 static std::vector<std::string_view>
@@ -365,6 +373,15 @@ static i64 parse_number(Context<E> &ctx, std::string opt,
   return ret;
 }
 
+static char from_hex(char c) {  // Maps one hex digit character to its value (0-15).
+  if ('0' <= c && c <= '9')
+    return c - '0';
+  if ('a' <= c && c <= 'f')
+    return c - 'a' + 10;
+  assert('A' <= c && c <= 'F');  // The callers visible here regex-validate input first.
+  return c - 'A' + 10;
+}
+
 template <typename E>
 static std::vector<u8> parse_hex_build_id(Context<E> &ctx, std::string_view arg) {
   auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript;
@@ -373,23 +390,34 @@ static std::vector<u8> parse_hex_build_id(Context<E> &ctx, std::string_view arg)
   if (!std::regex_match(arg.begin(), arg.end(), re))
     Fatal(ctx) << "invalid build-id: " << arg;
 
-  arg = arg.substr(2);
-
-  auto fn = [](char c) {
-    if ('0' <= c && c <= '9')
-      return c - '0';
-    if ('a' <= c && c <= 'f')
-      return c - 'a' + 10;
-    assert('A' <= c && c <= 'F');
-    return c - 'A' + 10;
-  };
-
   std::vector<u8> vec;
-  for (i64 i = 0; i < arg.size(); i += 2)
-    vec.push_back((fn(arg[i]) << 4) | fn(arg[i + 1]));
+  for (i64 i = 2; i < arg.size(); i += 2)
+    vec.push_back((from_hex(arg[i]) << 4) | from_hex(arg[i + 1]));
   return vec;
 }
 
+template <typename E>
+static std::string  // Returns arg with every "%XX" escape replaced by the byte 0xXX.
+parse_encoded_package_metadata(Context<E> &ctx, std::string_view arg) {
+  auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript;
+  static std::regex re(R"(([^%]|%[0-9a-fA-F][0-9a-fA-F])*)", flags);  // '%' needs two hex digits
+
+  if (!std::regex_match(arg.begin(), arg.end(), re))
+    Fatal(ctx) << "--encoded-package-metadata: invalid string: " << arg;
+
+  std::ostringstream out;
+  while (!arg.empty()) {
+    if (arg[0] == '%') {  // Decode the two hex digits after '%' into one character.
+      out << (char)((from_hex(arg[1]) << 4) | from_hex(arg[2]));
+      arg = arg.substr(3);
+    } else {  // Any other character is copied through unchanged.
+      out << arg[0];
+      arg = arg.substr(1);
+    }
+  }
+  return out.str();
+}
+
 static std::vector<std::string_view>
 split_by_comma_or_colon(std::string_view str) {
   std::vector<std::string_view> vec;
@@ -401,7 +429,7 @@ split_by_comma_or_colon(std::string_view str) {
       break;
     }
     vec.push_back(str.substr(0, pos));
-    str = str.substr(pos);
+    str = str.substr(pos + 1);
   }
   return vec;
 }
@@ -410,8 +438,7 @@ template <typename E>
 static void read_retain_symbols_file(Context<E> &ctx, std::string_view path) {
   MappedFile *mf = must_open_file(ctx, std::string(path));
   std::string_view data((char *)mf->data, mf->size);
-
-  ctx.arg.retain_symbols_file.reset(new std::unordered_set<std::string_view>);
+  std::vector<Symbol<E> *> vec;
 
   while (!data.empty()) {
     size_t pos = data.find('\n');
@@ -427,8 +454,10 @@ static void read_retain_symbols_file(Context<E> &ctx, std::string_view path) {
 
     name = string_trim(name);
     if (!name.empty())
-      ctx.arg.retain_symbols_file->insert(name);
+      vec.push_back(get_symbol(ctx, name));
   }
+
+  ctx.arg.retain_symbols_file = std::move(vec);
 }
 
 static bool is_file(std::string_view path) {
@@ -526,8 +555,10 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
   std::optional<SeparateCodeKind> z_separate_code;
   std::optional<bool> report_undefined;
   std::optional<bool> z_relro;
+  std::optional<std::string> separate_debug_file;
   std::optional<u64> shuffle_sections_seed;
   std::unordered_set<std::string_view> rpaths;
+  std::vector<std::string_view> version_scripts;
 
   auto add_rpath = [&](std::string_view arg) {
     if (rpaths.insert(arg).second) {
@@ -537,9 +568,9 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
     }
   };
 
-  // RISC-V object files contains lots of local symbols, so by default
-  // we discard them. This is compatible with GNU ld.
-  if constexpr (is_riscv<E>)
+  // RISC-V and LoongArch object files contain lots of local symbols,
+  // so by default we discard them. This is compatible with GNU ld.
+  if constexpr (is_riscv<E> || is_loongarch<E>)
     ctx.arg.discard_locals = true;
 
   // We generally don't need to write addends to relocated places if the
@@ -554,8 +585,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
   //
   // - Static PIE binaries crash on startup in some RISC-V environment if
   //   we write addends to relocated places.
-  if constexpr (is_sparc<E> || is_riscv<E>)
-    ctx.arg.apply_dynamic_relocs = false;
+  ctx.arg.apply_dynamic_relocs = !is_sparc<E> && !is_riscv<E>;
 
   auto read_arg = [&](std::string name) {
     for (const std::string &opt : add_dashes(name)) {
@@ -652,7 +682,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
                << "   elf64briscv\n   elf32lriscv\n   elf32briscv\n"
                << "   elf32ppc\n   elf64ppc\n   elf64lppc\n   elf64_s390\n"
                << "   elf64_sparc\n   m68kelf\n   shlelf_linux\n"
-               << "   elf64alpha\n   elf64loongarch\n   elf32loongarch";
+               << "   elf64loongarch\n   elf32loongarch";
       version_shown = true;
     } else if (read_arg("m")) {
       if (arg == "elf_x86_64") {
@@ -685,8 +715,6 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
         ctx.arg.emulation = M68K::target_name;
       } else if (arg == "shlelf_linux") {
         ctx.arg.emulation = SH4::target_name;
-      } else if (arg == "elf64alpha") {
-        ctx.arg.emulation = ALPHA::target_name;
       } else if (arg == "elf64loongarch") {
         ctx.arg.emulation = LOONGARCH64::target_name;
       } else if (arg == "elf32loongarch") {
@@ -725,10 +753,10 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
     } else if (read_flag("print-map") || read_flag("M")) {
       ctx.arg.print_map = true;
     } else if (read_flag("Bstatic") || read_flag("dn") || read_flag("static")) {
-      ctx.arg.is_static = true;
+      ctx.arg.static_ = true;
       remaining.push_back("--Bstatic");
     } else if (read_flag("Bdynamic") || read_flag("dy")) {
-      ctx.arg.is_static = false;
+      ctx.arg.static_ = false;
       remaining.push_back("--Bdynamic");
     } else if (read_flag("shared") || read_flag("Bshareable")) {
       ctx.arg.shared = true;
@@ -757,6 +785,10 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       ctx.arg.demangle = true;
     } else if (read_flag("no-demangle")) {
       ctx.arg.demangle = false;
+    } else if (read_flag("detach")) {
+      ctx.arg.detach = true;
+    } else if (read_flag("no-detach")) {
+      ctx.arg.detach = false;
     } else if (read_flag("default-symver")) {
       ctx.arg.default_symver = true;
     } else if (read_flag("noinhibit-exec")) {
@@ -863,6 +895,8 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
     } else if (read_flag("pack-dyn-relocs=none") ||
                read_z_flag("nopack-relative-relocs")) {
       ctx.arg.pack_dyn_relocs_relr = false;
+    } else if (read_arg("encoded-package-metadata")) {
+      ctx.arg.package_metadata = parse_encoded_package_metadata(ctx, arg);
     } else if (read_arg("package-metadata")) {
       ctx.arg.package_metadata = arg;
     } else if (read_flag("stats")) {
@@ -908,7 +942,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       ctx.arg.wrap.insert(arg);
     } else if (read_flag("omagic") || read_flag("N")) {
       ctx.arg.omagic = true;
-      ctx.arg.is_static = true;
+      ctx.arg.static_ = true;
     } else if (read_flag("no-omagic")) {
       ctx.arg.omagic = false;
     } else if (read_arg("oformat")) {
@@ -1004,6 +1038,12 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       ctx.arg.z_origin = true;
     } else if (read_z_flag("nodefaultlib")) {
       ctx.arg.z_nodefaultlib = true;
+    } else if (read_eq("separate-debug-file")) {
+      separate_debug_file = arg;
+    } else if (read_flag("separate-debug-file")) {
+      separate_debug_file = "";
+    } else if (read_flag("no-separate-debug-file")) {
+      separate_debug_file.reset();
     } else if (read_z_flag("separate-loadable-segments")) {
       z_separate_code = SEPARATE_LOADABLE_SEGMENTS;
     } else if (read_z_flag("separate-code")) {
@@ -1020,10 +1060,20 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       ctx.arg.z_sectionheader = true;
     } else if (read_z_flag("nosectionheader")) {
       ctx.arg.z_sectionheader = false;
-    } else if (read_z_flag("rewrite-endbr")) {
-      ctx.arg.z_rewrite_endbr = true;
     } else if (read_z_flag("rodynamic")) {
       ctx.arg.z_rodynamic = true;
+    } else if (read_z_flag("x86-64-v2")) {
+      ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V2;
+    } else if (read_z_flag("x86-64-v3")) {
+      ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V3;
+    } else if (read_z_flag("x86-64-v4")) {
+      ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V4;
+    } else if (read_z_flag("rewrite-endbr")) {
+      if constexpr (!is_x86_64<E>)
+        Fatal(ctx) << "-z rewrite-endbr is supported only on x86-64";
+      ctx.arg.z_rewrite_endbr = true;
+    } else if (read_z_flag("norewrite-endbr")) {
+      ctx.arg.z_rewrite_endbr = false;
     } else if (read_flag("nmagic")) {
       ctx.arg.nmagic = true;
     } else if (read_flag("no-nmagic")) {
@@ -1181,7 +1231,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       } else if (arg == "sha1") {
         ctx.arg.build_id.kind = BuildId::HASH;
         ctx.arg.build_id.hash_size = 20;
-      } else if (arg == "sha256") {
+      } else if (arg == "sha256" || arg == "fast") {
         ctx.arg.build_id.kind = BuildId::HASH;
         ctx.arg.build_id.hash_size = 32;
       } else if (arg.starts_with("0x") || arg.starts_with("0X")) {
@@ -1203,6 +1253,10 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       ctx.arg.auxiliary.push_back(arg);
     } else if (read_arg("filter") || read_arg("F")) {
       ctx.arg.filter.push_back(arg);
+    } else if (read_flag("allow-shlib-undefined")) {
+      ctx.arg.allow_shlib_undefined = true;
+    } else if (read_flag("no-allow-shlib-undefined")) {
+      ctx.arg.allow_shlib_undefined = false;
     } else if (read_arg("O")) {
     } else if (read_flag("EB")) {
     } else if (read_flag("EL")) {
@@ -1220,8 +1274,6 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
     } else if (read_flag("enable-new-dtags")) {
     } else if (read_flag("disable-new-dtags")) {
     } else if (read_flag("nostdlib")) {
-    } else if (read_flag("allow-shlib-undefined")) {
-    } else if (read_flag("no-allow-shlib-undefined")) {
     } else if (read_flag("no-add-needed")) {
     } else if (read_flag("no-call-graph-profile-sort")) {
     } else if (read_flag("no-copy-dt-needed-entries")) {
@@ -1246,17 +1298,12 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
     } else if (read_flag("no-keep-memory")) {
     } else if (read_arg("max-cache-size")) {
     } else if (read_arg("version-script")) {
-      // --version-script is treated as positional arguments even though
-      // they are actually not positional. This is because linker scripts
-      // (a positional argument) can also specify a version script, and
-      // it's better to consolidate parsing in read_input_files. In
-      // particular, version scripts can modify ctx.default_version which
-      // we initialize *after* parsing non-positional args, so the parsing
-      // cannot be done right here.
-      remaining.push_back("--version-script=" + std::string(arg));
+      version_scripts.push_back(arg);
     } else if (read_arg("dynamic-list")) {
       ctx.arg.Bsymbolic = BSYMBOLIC_ALL;
       append(ctx.dynamic_list_patterns, parse_dynamic_list(ctx, arg));
+    } else if (read_flag("dynamic-list-data")) {
+      ctx.arg.dynamic_list_data = true;  // boolean switch: takes no value
     } else if (read_arg("export-dynamic-symbol")) {
       ctx.dynamic_list_patterns.push_back({arg, "<command line>"});
     } else if (read_arg("export-dynamic-symbol-list")) {
@@ -1269,7 +1316,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       remaining.push_back("--whole-archive");
     } else if (read_flag("no-whole-archive")) {
       remaining.push_back("--no-whole-archive");
-    } else if (read_arg("l")) {
+    } else if (read_arg("l") || read_arg("library")) {
       remaining.push_back("-l" + std::string(arg));
     } else if (read_arg("script") || read_arg("T")) {
       remaining.push_back(std::string(arg));
@@ -1287,7 +1334,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
       Fatal(ctx) << "unknown command line option: -dynamic; -dynamic is a "
                  << "macOS linker's option. mold does not support macOS.";
     } else {
-      if (args[0][0] == '-')
+      if (args[0].starts_with('-'))
         Fatal(ctx) << "unknown command line option: " << args[0];
       remaining.push_back(std::string(args[0]));
       args = args.subspan(1);
@@ -1332,7 +1379,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
   }
 
   if (ctx.arg.relocatable)
-    ctx.arg.is_static = true;
+    ctx.arg.static_ = true;
 
   if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_SHUFFLE) {
     if (shuffle_sections_seed)
@@ -1395,9 +1442,35 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
     ctx.default_version = VER_NDX_LAST_RESERVED + 1;
   }
 
+  for (std::string_view path : version_scripts) {
+    auto open = [&] {
+      if (MappedFile *mf = open_file(ctx, std::string(path)))
+        return mf;
+      for (std::string_view dir : ctx.arg.library_paths)
+        if (MappedFile *mf =
+            open_file(ctx, std::string(dir) + "/" + std::string(path)))
+          return mf;
+      Fatal(ctx) << "--version-script: file not found: " << path;
+    };
+
+    ReaderContext rctx;
+    Script(ctx, rctx, open()).parse_version_script();
+  }
+
+  if (separate_debug_file) {
+    if (separate_debug_file->empty())
+      ctx.arg.separate_debug_file = ctx.arg.output + ".dbg";
+    else
+      ctx.arg.separate_debug_file = *separate_debug_file;
+  }
+
   if (ctx.arg.shared && warn_shared_textrel)
     ctx.arg.warn_textrel = true;
 
+  // We don't want the background process to write to stdout
+  if (ctx.arg.stats || ctx.arg.perf)
+    ctx.arg.detach = false;
+
   ctx.arg.undefined.push_back(ctx.arg.entry);
 
   for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) {
@@ -1450,4 +1523,4 @@ using E = MOLD_TARGET;
 template std::vector<std::string_view> expand_response_files(Context<E> &, char **);
 template std::vector<std::string> parse_nonpositional_args(Context<E> &ctx);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/config.cc b/src/config.cc
similarity index 85%
rename from elf/config.cc
rename to src/config.cc
index 55db9603..af578ab8 100644
--- a/elf/config.cc
+++ b/src/config.cc
@@ -1,7 +1,7 @@
 #include "mold.h"
 #include "config.h"
 
-namespace mold::elf {
+namespace mold {
 
 std::string get_mold_version() {
   if (mold_git_hash.empty())
@@ -10,4 +10,4 @@ std::string get_mold_version() {
          "; compatible with GNU ld)";
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/elf.cc b/src/elf.cc
similarity index 96%
rename from elf/elf.cc
rename to src/elf.cc
index 2ce2ec47..8f78df67 100644
--- a/elf/elf.cc
+++ b/src/elf.cc
@@ -1,6 +1,6 @@
-#include "mold.h"
+#include "elf.h"
 
-namespace mold::elf {
+namespace mold {
 
 static std::string unknown_type(u32 r_type) {
   char buf[50];
@@ -890,46 +890,6 @@ std::string rel_to_string<SH4>(u32 r_type) {
   return unknown_type(r_type);
 }
 
-template <>
-std::string rel_to_string<ALPHA>(u32 r_type) {
-  switch (r_type) {
-  CASE(R_ALPHA_NONE);
-  CASE(R_ALPHA_REFLONG);
-  CASE(R_ALPHA_REFQUAD);
-  CASE(R_ALPHA_GPREL32);
-  CASE(R_ALPHA_LITERAL);
-  CASE(R_ALPHA_LITUSE);
-  CASE(R_ALPHA_GPDISP);
-  CASE(R_ALPHA_BRADDR);
-  CASE(R_ALPHA_HINT);
-  CASE(R_ALPHA_SREL16);
-  CASE(R_ALPHA_SREL32);
-  CASE(R_ALPHA_SREL64);
-  CASE(R_ALPHA_GPRELHIGH);
-  CASE(R_ALPHA_GPRELLOW);
-  CASE(R_ALPHA_GPREL16);
-  CASE(R_ALPHA_COPY);
-  CASE(R_ALPHA_GLOB_DAT);
-  CASE(R_ALPHA_JMP_SLOT);
-  CASE(R_ALPHA_RELATIVE);
-  CASE(R_ALPHA_BRSGP);
-  CASE(R_ALPHA_TLSGD);
-  CASE(R_ALPHA_TLSLDM);
-  CASE(R_ALPHA_DTPMOD64);
-  CASE(R_ALPHA_GOTDTPREL);
-  CASE(R_ALPHA_DTPREL64);
-  CASE(R_ALPHA_DTPRELHI);
-  CASE(R_ALPHA_DTPRELLO);
-  CASE(R_ALPHA_DTPREL16);
-  CASE(R_ALPHA_GOTTPREL);
-  CASE(R_ALPHA_TPREL64);
-  CASE(R_ALPHA_TPRELHI);
-  CASE(R_ALPHA_TPRELLO);
-  CASE(R_ALPHA_TPREL16);
-  }
-  return unknown_type(r_type);
-}
-
 template <>
 std::string rel_to_string<LOONGARCH64>(u32 r_type) {
   switch (r_type) {
@@ -946,6 +906,8 @@ std::string rel_to_string<LOONGARCH64>(u32 r_type) {
   CASE(R_LARCH_TLS_TPREL32);
   CASE(R_LARCH_TLS_TPREL64);
   CASE(R_LARCH_IRELATIVE);
+  CASE(R_LARCH_TLS_DESC32);
+  CASE(R_LARCH_TLS_DESC64);
   CASE(R_LARCH_MARK_LA);
   CASE(R_LARCH_MARK_PCREL);
   CASE(R_LARCH_SOP_PUSH_PCREL);
@@ -1031,6 +993,23 @@ std::string rel_to_string<LOONGARCH64>(u32 r_type) {
   CASE(R_LARCH_ADD_ULEB128);
   CASE(R_LARCH_SUB_ULEB128);
   CASE(R_LARCH_64_PCREL);
+  CASE(R_LARCH_CALL36);
+  CASE(R_LARCH_TLS_DESC_PC_HI20);
+  CASE(R_LARCH_TLS_DESC_PC_LO12);
+  CASE(R_LARCH_TLS_DESC64_PC_LO20);
+  CASE(R_LARCH_TLS_DESC64_PC_HI12);
+  CASE(R_LARCH_TLS_DESC_HI20);
+  CASE(R_LARCH_TLS_DESC_LO12);
+  CASE(R_LARCH_TLS_DESC64_LO20);
+  CASE(R_LARCH_TLS_DESC64_HI12);
+  CASE(R_LARCH_TLS_DESC_LD);
+  CASE(R_LARCH_TLS_DESC_CALL);
+  CASE(R_LARCH_TLS_LE_HI20_R);
+  CASE(R_LARCH_TLS_LE_ADD_R);
+  CASE(R_LARCH_TLS_LE_LO12_R);
+  CASE(R_LARCH_TLS_LD_PCREL20_S2);
+  CASE(R_LARCH_TLS_GD_PCREL20_S2);
+  CASE(R_LARCH_TLS_DESC_PCREL20_S2);
   }
   return unknown_type(r_type);
 }
@@ -1040,4 +1019,4 @@ std::string rel_to_string<LOONGARCH32>(u32 r_type) {
   return rel_to_string<LOONGARCH64>(r_type);
 }
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/elf.h b/src/elf.h
similarity index 95%
rename from elf/elf.h
rename to src/elf.h
index c58fea05..08ca6db2 100644
--- a/elf/elf.h
+++ b/src/elf.h
@@ -1,13 +1,13 @@
 #pragma once
 
-#include "../common/integers.h"
+#include "../lib/integers.h"
 
 #include <concepts>
 #include <ostream>
 #include <string>
 #include <type_traits>
 
-namespace mold::elf {
+namespace mold {
 
 struct X86_64;
 struct I386;
@@ -24,7 +24,6 @@ struct S390X;
 struct SPARC64;
 struct M68K;
 struct SH4;
-struct ALPHA;
 struct LOONGARCH64;
 struct LOONGARCH32;
 
@@ -191,6 +190,7 @@ enum : u32 {
   PT_GNU_EH_FRAME = 0x6474e550,
   PT_GNU_STACK = 0x6474e551,
   PT_GNU_RELRO = 0x6474e552,
+  PT_GNU_PROPERTY = 0x6474e553,
   PT_OPENBSD_RANDOMIZE = 0x65a3dbe6,
   PT_ARM_EXIDX = 0x70000001,
   PT_RISCV_ATTRIBUTES = 0x70000003,
@@ -238,7 +238,6 @@ enum : u32 {
   EM_AARCH64 = 183,
   EM_RISCV = 243,
   EM_LOONGARCH = 258,
-  EM_ALPHA = 0x9026,
 };
 
 enum : u32 {
@@ -342,6 +341,12 @@ enum : u32 {
   GNU_PROPERTY_X86_FEATURE_1_IBT = 1,
   GNU_PROPERTY_X86_FEATURE_1_SHSTK = 2,
   GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002,
+
+  GNU_PROPERTY_X86_ISA_1_NEEDED = 0xc0008002,
+  GNU_PROPERTY_X86_ISA_1_BASELINE = 1,
+  GNU_PROPERTY_X86_ISA_1_V2 = 2,
+  GNU_PROPERTY_X86_ISA_1_V3 = 4,
+  GNU_PROPERTY_X86_ISA_1_V4 = 8,
 };
 
 enum : u32 {
@@ -379,8 +384,6 @@ enum : u32 {
 
 enum : u32 {
   STO_RISCV_VARIANT_CC = 0x80,
-  STO_ALPHA_NOPV = 0x20,
-  STO_ALPHA_STD_GPLOAD = 0x22,
 };
 
 enum : u32 {
@@ -1227,42 +1230,6 @@ enum : u32 {
   R_SH_GOTPLT32 = 168,
 };
 
-enum : u32 {
-  R_ALPHA_NONE = 0,
-  R_ALPHA_REFLONG = 1,
-  R_ALPHA_REFQUAD = 2,
-  R_ALPHA_GPREL32 = 3,
-  R_ALPHA_LITERAL = 4,
-  R_ALPHA_LITUSE = 5,
-  R_ALPHA_GPDISP = 6,
-  R_ALPHA_BRADDR = 7,
-  R_ALPHA_HINT = 8,
-  R_ALPHA_SREL16 = 9,
-  R_ALPHA_SREL32 = 10,
-  R_ALPHA_SREL64 = 11,
-  R_ALPHA_GPRELHIGH = 17,
-  R_ALPHA_GPRELLOW = 18,
-  R_ALPHA_GPREL16 = 19,
-  R_ALPHA_COPY = 24,
-  R_ALPHA_GLOB_DAT = 25,
-  R_ALPHA_JMP_SLOT = 26,
-  R_ALPHA_RELATIVE = 27,
-  R_ALPHA_BRSGP = 28,
-  R_ALPHA_TLSGD = 29,
-  R_ALPHA_TLSLDM = 30,
-  R_ALPHA_DTPMOD64 = 31,
-  R_ALPHA_GOTDTPREL = 32,
-  R_ALPHA_DTPREL64 = 33,
-  R_ALPHA_DTPRELHI = 34,
-  R_ALPHA_DTPRELLO = 35,
-  R_ALPHA_DTPREL16 = 36,
-  R_ALPHA_GOTTPREL = 37,
-  R_ALPHA_TPREL64 = 38,
-  R_ALPHA_TPRELHI = 39,
-  R_ALPHA_TPRELLO = 40,
-  R_ALPHA_TPREL16 = 41,
-};
-
 enum : u32 {
   R_LARCH_NONE = 0,
   R_LARCH_32 = 1,
@@ -1277,6 +1244,8 @@ enum : u32 {
   R_LARCH_TLS_TPREL32 = 10,
   R_LARCH_TLS_TPREL64 = 11,
   R_LARCH_IRELATIVE = 12,
+  R_LARCH_TLS_DESC32 = 13,
+  R_LARCH_TLS_DESC64 = 14,
   R_LARCH_MARK_LA = 20,
   R_LARCH_MARK_PCREL = 21,
   R_LARCH_SOP_PUSH_PCREL = 22,
@@ -1362,6 +1331,23 @@ enum : u32 {
   R_LARCH_ADD_ULEB128 = 107,
   R_LARCH_SUB_ULEB128 = 108,
   R_LARCH_64_PCREL = 109,
+  R_LARCH_CALL36 = 110,
+  R_LARCH_TLS_DESC_PC_HI20 = 111,
+  R_LARCH_TLS_DESC_PC_LO12 = 112,
+  R_LARCH_TLS_DESC64_PC_LO20 = 113,
+  R_LARCH_TLS_DESC64_PC_HI12 = 114,
+  R_LARCH_TLS_DESC_HI20 = 115,
+  R_LARCH_TLS_DESC_LO12 = 116,
+  R_LARCH_TLS_DESC64_LO20 = 117,
+  R_LARCH_TLS_DESC64_HI12 = 118,
+  R_LARCH_TLS_DESC_LD = 119,
+  R_LARCH_TLS_DESC_CALL = 120,
+  R_LARCH_TLS_LE_HI20_R = 121,
+  R_LARCH_TLS_LE_ADD_R = 122,
+  R_LARCH_TLS_LE_LO12_R = 123,
+  R_LARCH_TLS_LD_PCREL20_S2 = 124,
+  R_LARCH_TLS_GD_PCREL20_S2 = 125,
+  R_LARCH_TLS_DESC_PCREL20_S2 = 126,
 };
 
 //
@@ -1786,33 +1772,6 @@ struct ElfSym<PPC64V2> {
   ul64 st_size;
 };
 
-template <>
-struct ElfSym<ALPHA> {
-  bool is_undef() const { return st_shndx == SHN_UNDEF; }
-  bool is_abs() const { return st_shndx == SHN_ABS; }
-  bool is_common() const { return st_shndx == SHN_COMMON; }
-  bool is_weak() const { return st_bind == STB_WEAK; }
-  bool is_undef_weak() const { return is_undef() && is_weak(); }
-
-  ul32 st_name;
-
-#ifdef __LITTLE_ENDIAN__
-  u8 st_type : 4;
-  u8 st_bind : 4;
-  u8 st_visibility : 2;
-  u8 alpha_st_other : 6; // contains STO_ALPHA_NOPV, STO_ALPHA_STD_GPLOAD or 0
-#else
-  u8 st_bind : 4;
-  u8 st_type : 4;
-  u8 alpha_st_other : 6;
-  u8 st_visibility : 2;
-#endif
-
-  ul16 st_shndx;
-  ul64 st_value;
-  ul64 st_size;
-};
-
 template <>
 struct ElfRel<SPARC64> {
   ElfRel() = default;
@@ -1831,7 +1790,7 @@ template <>
 struct ElfRel<SH4> {
   ElfRel() = default;
 
-  // Addend is ignored except for base relocations because  even though
+  // Addend is ignored except for base relocations because even though
   // SH4 is RELA, r_addend is ignored in most cases and works as if it
   // were REL.
   ElfRel(u64 offset, u32 type, u32 sym, i64 addend)
@@ -1866,7 +1825,6 @@ template <typename E> concept is_s390x = std::same_as<E, S390X>;
 template <typename E> concept is_sparc64 = std::same_as<E, SPARC64>;
 template <typename E> concept is_m68k = std::same_as<E, M68K>;
 template <typename E> concept is_sh4 = std::same_as<E, SH4>;
-template <typename E> concept is_alpha = std::same_as<E, ALPHA>;
 template <typename E> concept is_loongarch64 = std::same_as<E, LOONGARCH64>;
 template <typename E> concept is_loongarch32 = std::same_as<E, LOONGARCH32>;
 
@@ -2215,29 +2173,6 @@ struct SH4 {
   static constexpr u32 R_FUNCALL[] = { R_SH_PLT32 };
 };
 
-struct ALPHA {
-  static constexpr std::string_view target_name = "alpha";
-  static constexpr bool is_64 = true;
-  static constexpr bool is_le = true;
-  static constexpr bool is_rela = true;
-  static constexpr u32 page_size = 65536;
-  static constexpr u32 e_machine = EM_ALPHA;
-  static constexpr u32 plt_hdr_size = 0;
-  static constexpr u32 plt_size = 0;
-  static constexpr u32 pltgot_size = 0;
-  static constexpr u8 filler[] = { 0x81, 0x00, 0x00, 0x00 }; // bugchk
-
-  static constexpr u32 R_COPY = R_ALPHA_COPY;
-  static constexpr u32 R_GLOB_DAT = R_ALPHA_GLOB_DAT;
-  static constexpr u32 R_JUMP_SLOT = R_ALPHA_JMP_SLOT;
-  static constexpr u32 R_ABS = R_ALPHA_REFQUAD;
-  static constexpr u32 R_RELATIVE = R_ALPHA_RELATIVE;
-  static constexpr u32 R_DTPOFF = R_ALPHA_DTPREL64;
-  static constexpr u32 R_TPOFF = R_ALPHA_TPREL64;
-  static constexpr u32 R_DTPMOD = R_ALPHA_DTPMOD64;
-  static constexpr u32 R_FUNCALL[] = {};
-};
-
 struct LOONGARCH64 {
   static constexpr std::string_view target_name = "loongarch64";
   static constexpr bool is_64 = true;
@@ -2248,8 +2183,6 @@ struct LOONGARCH64 {
   static constexpr u32 plt_hdr_size = 32;
   static constexpr u32 plt_size = 16;
   static constexpr u32 pltgot_size = 16;
-  static constexpr u32 thunk_hdr_size = 0;
-  static constexpr u32 thunk_size = 8;
   static constexpr u8 filler[] = { 0x00, 0x00, 0x2a, 0x00 }; // break 0
 
   static constexpr u32 R_COPY = R_LARCH_COPY;
@@ -2261,7 +2194,8 @@ struct LOONGARCH64 {
   static constexpr u32 R_DTPOFF = R_LARCH_TLS_DTPREL64;
   static constexpr u32 R_TPOFF = R_LARCH_TLS_TPREL64;
   static constexpr u32 R_DTPMOD = R_LARCH_TLS_DTPMOD64;
-  static constexpr u32 R_FUNCALL[] = { R_LARCH_B26 };
+  static constexpr u32 R_TLSDESC = R_LARCH_TLS_DESC64;
+  static constexpr u32 R_FUNCALL[] = { R_LARCH_B26, R_LARCH_CALL36 };
 };
 
 struct LOONGARCH32 {
@@ -2274,8 +2208,6 @@ struct LOONGARCH32 {
   static constexpr u32 plt_hdr_size = 32;
   static constexpr u32 plt_size = 16;
   static constexpr u32 pltgot_size = 16;
-  static constexpr u32 thunk_hdr_size = 0;
-  static constexpr u32 thunk_size = 8;
   static constexpr u8 filler[] = { 0x00, 0x00, 0x2a, 0x00 }; // break 0
 
   static constexpr u32 R_COPY = R_LARCH_COPY;
@@ -2287,7 +2219,8 @@ struct LOONGARCH32 {
   static constexpr u32 R_DTPOFF = R_LARCH_TLS_DTPREL32;
   static constexpr u32 R_TPOFF = R_LARCH_TLS_TPREL32;
   static constexpr u32 R_DTPMOD = R_LARCH_TLS_DTPMOD32;
-  static constexpr u32 R_FUNCALL[] = { R_LARCH_B26 };
+  static constexpr u32 R_TLSDESC = R_LARCH_TLS_DESC32;
+  static constexpr u32 R_FUNCALL[] = { R_LARCH_B26, R_LARCH_CALL36 };
 };
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/common/filetype.h b/src/filetype.h
similarity index 64%
rename from common/filetype.h
rename to src/filetype.h
index b2c46578..50b605da 100644
--- a/common/filetype.h
+++ b/src/filetype.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "common.h"
-#include "../elf/elf.h"
+#include "../lib/common.h"
+#include "elf.h"
 
 namespace mold {
 
@@ -10,21 +10,14 @@ enum class FileType {
   EMPTY,
   ELF_OBJ,
   ELF_DSO,
-  MACH_OBJ,
-  MACH_EXE,
-  MACH_DYLIB,
-  MACH_BUNDLE,
-  MACH_UNIVERSAL,
   AR,
   THIN_AR,
-  TAPI,
   TEXT,
   GCC_LTO_OBJ,
   LLVM_BITCODE,
 };
 
-template <typename MappedFile>
-bool is_text_file(MappedFile *mf) {
+inline bool is_text_file(MappedFile *mf) {
   auto istext = [](char c) {
     return isprint(c) || c == '\n' || c == '\t';
   };
@@ -34,10 +27,8 @@ bool is_text_file(MappedFile *mf) {
          istext(data[2]) && istext(data[3]);
 }
 
-template <typename E, typename Context, typename MappedFile>
-inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) {
-  using namespace mold::elf;
-
+template <typename E>
+inline bool is_gcc_lto_obj(MappedFile *mf, bool has_plugin) {
   const char *data = mf->get_contents().data();
   ElfEhdr<E> &ehdr = *(ElfEhdr<E> *)data;
   ElfShdr<E> *sh_begin = (ElfShdr<E> *)(data + ehdr.e_shoff);
@@ -54,7 +45,7 @@ inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) {
     // the LTO linker plugin is available and falls back as regular
     // objects otherwise. GCC FAT LTO object can be identified by the
     // presence of `.gcc.lto_.symtab` section.
-    if (!ctx.arg.plugin.empty()) {
+    if (has_plugin) {
       std::string_view name = data + shdrs[shstrtab_idx].sh_offset + sec.sh_name;
       if (name.starts_with(".gnu.lto_.symtab."))
         return true;
@@ -89,11 +80,10 @@ inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) {
   return false;
 }
 
-template <typename Context, typename MappedFile>
-FileType get_file_type(Context &ctx, MappedFile *mf) {
-  using namespace elf;
-
+template <typename E>
+FileType get_file_type(Context<E> &ctx, MappedFile *mf) {
   std::string_view data = mf->get_contents();
+  bool has_plugin = !ctx.arg.plugin.empty();
 
   if (data.empty())
     return FileType::EMPTY;
@@ -106,10 +96,10 @@ FileType get_file_type(Context &ctx, MappedFile *mf) {
 
       if (ehdr.e_type == ET_REL) {
         if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) {
-          if (is_gcc_lto_obj<I386>(ctx, mf))
+          if (is_gcc_lto_obj<I386>(mf, has_plugin))
             return FileType::GCC_LTO_OBJ;
         } else {
-          if (is_gcc_lto_obj<X86_64>(ctx, mf))
+          if (is_gcc_lto_obj<X86_64>(mf, has_plugin))
             return FileType::GCC_LTO_OBJ;
         }
         return FileType::ELF_OBJ;
@@ -122,10 +112,10 @@ FileType get_file_type(Context &ctx, MappedFile *mf) {
 
       if (ehdr.e_type == ET_REL) {
         if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) {
-          if (is_gcc_lto_obj<M68K>(ctx, mf))
+          if (is_gcc_lto_obj<M68K>(mf, has_plugin))
             return FileType::GCC_LTO_OBJ;
         } else {
-          if (is_gcc_lto_obj<SPARC64>(ctx, mf))
+          if (is_gcc_lto_obj<SPARC64>(mf, has_plugin))
             return FileType::GCC_LTO_OBJ;
         }
         return FileType::ELF_OBJ;
@@ -137,28 +127,10 @@ FileType get_file_type(Context &ctx, MappedFile *mf) {
     return FileType::UNKNOWN;
   }
 
-  if (data.starts_with("\xcf\xfa\xed\xfe")) {
-    switch (*(ul32 *)(data.data() + 12)) {
-    case 1: // MH_OBJECT
-      return FileType::MACH_OBJ;
-    case 2: // MH_EXECUTE
-      return FileType::MACH_EXE;
-    case 6: // MH_DYLIB
-      return FileType::MACH_DYLIB;
-    case 8: // MH_BUNDLE
-      return FileType::MACH_BUNDLE;
-    }
-    return FileType::UNKNOWN;
-  }
-
   if (data.starts_with("!<arch>\n"))
     return FileType::AR;
   if (data.starts_with("!<thin>\n"))
     return FileType::THIN_AR;
-  if (data.starts_with("--- !tapi-tbd"))
-    return FileType::TAPI;
-  if (data.starts_with("\xca\xfe\xba\xbe"))
-    return FileType::MACH_UNIVERSAL;
   if (is_text_file(mf))
     return FileType::TEXT;
   if (data.starts_with("\xde\xc0\x17\x0b"))
@@ -168,29 +140,23 @@ FileType get_file_type(Context &ctx, MappedFile *mf) {
   return FileType::UNKNOWN;
 }
 
-inline std::string filetype_to_string(FileType type) {
-  switch (type) {
-  case FileType::UNKNOWN: return "UNKNOWN";
-  case FileType::EMPTY: return "EMPTY";
-  case FileType::ELF_OBJ: return "ELF_OBJ";
-  case FileType::ELF_DSO: return "ELF_DSO";
-  case FileType::MACH_EXE: return "MACH_EXE";
-  case FileType::MACH_OBJ: return "MACH_OBJ";
-  case FileType::MACH_DYLIB: return "MACH_DYLIB";
-  case FileType::MACH_BUNDLE: return "MACH_BUNDLE";
-  case FileType::MACH_UNIVERSAL: return "MACH_UNIVERSAL";
-  case FileType::AR: return "AR";
-  case FileType::THIN_AR: return "THIN_AR";
-  case FileType::TAPI: return "TAPI";
-  case FileType::TEXT: return "TEXT";
-  case FileType::GCC_LTO_OBJ: return "GCC_LTO_OBJ";
-  case FileType::LLVM_BITCODE: return "LLVM_BITCODE";
-  }
-  return "UNKNOWN";
-}
-
 inline std::ostream &operator<<(std::ostream &out, FileType type) {
-  out << filetype_to_string(type);
+  auto to_string = [&] {  // maps `type` to its enumerator name
+    switch (type) {
+    case FileType::UNKNOWN: return "UNKNOWN";
+    case FileType::EMPTY: return "EMPTY";
+    case FileType::ELF_OBJ: return "ELF_OBJ";
+    case FileType::ELF_DSO: return "ELF_DSO";
+    case FileType::AR: return "AR";
+    case FileType::THIN_AR: return "THIN_AR";
+    case FileType::TEXT: return "TEXT";
+    case FileType::GCC_LTO_OBJ: return "GCC_LTO_OBJ";
+    case FileType::LLVM_BITCODE: return "LLVM_BITCODE";
+    default: return "UNKNOWN";  // any value not listed above
+    }
+  };
+
+  out << to_string();
   return out;
 }
 
diff --git a/elf/gc-sections.cc b/src/gc-sections.cc
similarity index 99%
rename from elf/gc-sections.cc
rename to src/gc-sections.cc
index 34334a7c..efc6cd6d 100644
--- a/elf/gc-sections.cc
+++ b/src/gc-sections.cc
@@ -7,7 +7,7 @@
 #include <tbb/concurrent_vector.h>
 #include <tbb/parallel_for_each.h>
 
-namespace mold::elf {
+namespace mold {
 
 template <typename E>
 static bool should_keep(const InputSection<E> &isec) {
@@ -172,4 +172,4 @@ using E = MOLD_TARGET;
 
 template void gc_sections(Context<E> &ctx);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/gdb-index.cc b/src/gdb-index.cc
similarity index 99%
rename from elf/gdb-index.cc
rename to src/gdb-index.cc
index d13ec49d..a87b7691 100644
--- a/elf/gdb-index.cc
+++ b/src/gdb-index.cc
@@ -60,7 +60,7 @@
 #include <tbb/parallel_for_each.h>
 #include <tbb/parallel_sort.h>
 
-namespace mold::elf {
+namespace mold {
 
 enum DwarfKind { DWARF2_32, DWARF5_32, DWARF2_64, DWARF5_64 };
 
@@ -791,4 +791,4 @@ using E = MOLD_TARGET;
 
 template void write_gdb_index(Context<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/icf.cc b/src/icf.cc
similarity index 99%
rename from elf/icf.cc
rename to src/icf.cc
index cc64c626..cdf70760 100644
--- a/elf/icf.cc
+++ b/src/icf.cc
@@ -65,7 +65,7 @@
 // conditions.
 
 #include "mold.h"
-#include "../common/siphash.h"
+#include "../lib/siphash.h"
 
 #include <array>
 #include <cstdio>
@@ -91,7 +91,7 @@ template <> struct hash<Digest> {
 };
 }
 
-namespace mold::elf {
+namespace mold {
 
 static u8 hmac_key[16];
 
@@ -599,7 +599,7 @@ void icf_sections(Context<E> &ctx) {
     static Counter eliminated("icf_eliminated");
     tbb::parallel_for_each(ctx.objs, [](ObjectFile<E> *file) {
       for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
-        if (isec && isec->is_alive && isec->is_killed_by_icf()) {
+        if (isec && isec->is_alive && isec->icf_removed()) {
           isec->kill();
           eliminated++;
         }
@@ -612,4 +612,4 @@ using E = MOLD_TARGET;
 
 template void icf_sections(Context<E> &ctx);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/input-files.cc b/src/input-files.cc
similarity index 88%
rename from elf/input-files.cc
rename to src/input-files.cc
index 6d8ad8f4..afe1fc1e 100644
--- a/elf/input-files.cc
+++ b/src/input-files.cc
@@ -8,7 +8,7 @@
 # include <unistd.h>
 #endif
 
-namespace mold::elf {
+namespace mold {
 
 // If we haven't seen the same `key` before, create a new instance
 // of Symbol and returns it. Otherwise, returns the previously-
@@ -243,7 +243,7 @@ static bool is_known_section_type(const ElfShdr<E> &shdr) {
     return true;
   if (SHT_LOOS <= ty && ty <= SHT_HIOS && !(flags & SHF_OS_NONCONFORMING))
     return true;
-  if (is_x86<E> && ty == SHT_X86_64_UNWIND)
+  if (is_x86_64<E> && ty == SHT_X86_64_UNWIND)
     return true;
   if (is_arm32<E> && (ty == SHT_ARM_EXIDX || ty == SHT_ARM_ATTRIBUTES))
     return true;
@@ -564,11 +564,15 @@ void ObjectFile<E>::parse_ehframe(Context<E> &ctx) {
   for (i64 i = 0; i < fdes.size();) {
     InputSection<E> *isec = get_isec(fdes[i]);
     assert(isec->fde_begin == -1);
-    isec->fde_begin = i++;
 
-    while (i < fdes.size() && isec == get_isec(fdes[i]))
-      i++;
-    isec->fde_end = i;
+    if (isec->is_alive) {
+      isec->fde_begin = i++;
+      while (i < fdes.size() && isec == get_isec(fdes[i]))
+        i++;
+      isec->fde_end = i;
+    } else {
+      fdes[i++].is_alive = false;
+    }
   }
 }
 
@@ -677,102 +681,27 @@ void ObjectFile<E>::sort_relocations(Context<E> &ctx) {
   }
 }
 
-static size_t find_null(std::string_view data, i64 pos, i64 entsize) {
-  if (entsize == 1)
-    return data.find('\0', pos);
-
-  for (; pos <= data.size() - entsize; pos += entsize)
-    if (data.substr(pos, entsize).find_first_not_of('\0') == data.npos)
-      return pos;
-
-  return data.npos;
-}
-
-// Mergeable sections (sections with SHF_MERGE bit) typically contain
-// string literals. Linker is expected to split the section contents
-// into null-terminated strings, merge them with mergeable strings
-// from other object files, and emit uniquified strings to an output
-// file.
-//
-// This mechanism reduces the size of an output file. If two source
-// files happen to contain the same string literal, the output will
-// contain only a single copy of it.
-//
-// It is less common than string literals, but mergeable sections can
-// contain fixed-sized read-only records too.
-//
-// This function splits the section contents into small pieces that we
-// call "section fragments". Section fragment is a unit of merging.
-//
-// We do not support mergeable sections that have relocations.
 template <typename E>
-static std::unique_ptr<MergeableSection<E>>
-split_section(Context<E> &ctx, InputSection<E> &sec) {
-  if (!sec.is_alive || sec.relsec_idx != -1 || sec.sh_size == 0)
-    return nullptr;
-
-  const ElfShdr<E> &shdr = sec.shdr();
-  if (!(shdr.sh_flags & SHF_MERGE))
-    return nullptr;
-
-  i64 entsize = shdr.sh_entsize;
-  if (entsize == 0)
-    entsize = (shdr.sh_flags & SHF_STRINGS) ? 1 : (int)shdr.sh_addralign;
-
-  if (entsize == 0)
-    return nullptr;
-
-  i64 addralign = shdr.sh_addralign;
-  if (addralign == 0)
-    addralign = 1;
-
-  std::unique_ptr<MergeableSection<E>> m(new MergeableSection<E>);
-  m->parent = MergedSection<E>::get_instance(ctx, sec.name(), shdr.sh_type,
-                                             shdr.sh_flags, entsize, addralign);
-  m->p2align = sec.p2align;
-
-  // If thes section contents are compressed, uncompress them.
-  sec.uncompress(ctx);
-
-  std::string_view data = sec.contents;
-  m->contents = sec.contents;
-
-  if (data.size() > UINT32_MAX)
-    Fatal(ctx) << sec << ": mergeable section too large";
-
-  // Split sections
-  if (shdr.sh_flags & SHF_STRINGS) {
-    for (i64 pos = 0; pos < data.size();) {
-      m->frag_offsets.push_back(pos);
-      size_t end = find_null(data, pos, entsize);
-      if (end == data.npos)
-        Fatal(ctx) << sec << ": string is not null terminated";
-      pos = end + entsize;
-    }
-  } else {
-    if (data.size() % entsize)
-      Fatal(ctx) << sec << ": section size is not multiple of sh_entsize";
-    m->frag_offsets.reserve(data.size() / entsize);
+void ObjectFile<E>::convert_mergeable_sections(Context<E> &ctx) {
+  // Convert InputSections to MergeableSections
+  for (i64 i = 0; i < this->sections.size(); i++) {
+    InputSection<E> *isec = this->sections[i].get();
+    if (!isec || isec->sh_size == 0 || isec->relsec_idx != -1)
+      continue;
 
-    for (i64 pos = 0; pos < data.size(); pos += entsize)
-      m->frag_offsets.push_back(pos);
-  }
+    const ElfShdr<E> &shdr = isec->shdr();
+    if (!(shdr.sh_flags & SHF_MERGE))
+      continue;
 
-  // Compute hashes for section pieces
-  HyperLogLog estimator;
-  m->hashes.reserve(m->frag_offsets.size());
+    MergedSection<E> *parent =
+      MergedSection<E>::get_instance(ctx, isec->name(), shdr);
 
-  for (i64 i = 0; i < m->frag_offsets.size(); i++) {
-    u64 hash = hash_string(m->get_contents(i));
-    m->hashes.push_back(hash);
-    estimator.insert(hash);
+    if (parent) {
+      this->mergeable_sections[i] =
+        std::make_unique<MergeableSection<E>>(ctx, *parent, this->sections[i]);
+      this->sections[i] = nullptr;
+    }
   }
-
-  m->parent->estimator.merge(estimator);
-
-  static Counter counter("string_fragments");
-  counter += m->frag_offsets.size();
-  return m;
 }
 
 // Usually a section is an atomic unit of inclusion or exclusion.
@@ -811,43 +740,17 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
 // section piece in a section, but it doesn't do for any other types
 // of symbols.
 //
-// In mold, we attach symbols to section pieces. If a relocation refers
-// to a section symbol, and that symbol's section is a mergeable one,
-// we create a new dummy symbol for a section piece and redirect the
-// relocation to this new symbol. If a non-section symbol refers to a
-// section piece, the section piece is attached to the symbol.
-template <typename E>
-void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
-  mergeable_sections.resize(sections.size());
-
-  for (i64 i = 0; i < sections.size(); i++) {
-    if (std::unique_ptr<InputSection<E>> &isec = sections[i]) {
-      if (std::unique_ptr<MergeableSection<E>> m = split_section(ctx, *isec)) {
-        mergeable_sections[i] = std::move(m);
-        isec->is_alive = false;
-      }
-    }
-  }
-}
-
+// Section garbage collection and Identical Code Folding work on graphs
+// where sections or section pieces are vertices and relocations are
+// edges. To make it easy to handle them, we rewrite symbols and
+// relocations so that each non-absolute symbol always refers to either
+// a non-mergeable section or a section piece.
+//
+// We do that only for SHF_ALLOC sections because GC and ICF work only
+// on memory-allocated sections. Non-memory-allocated mergeable sections
+// are not handled here for performance reasons.
 template <typename E>
-void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
-  for (std::unique_ptr<MergeableSection<E>> &m : mergeable_sections) {
-    if (m) {
-      m->fragments.reserve(m->frag_offsets.size());
-
-      for (i64 i = 0; i < m->frag_offsets.size(); i++) {
-        SectionFragment<E> *frag =
-          m->parent->insert(ctx, m->get_contents(i), m->hashes[i], m->p2align);
-        m->fragments.push_back(frag);
-      }
-
-      // Reclaim memory as we'll never use this vector again
-      m->hashes.clear();
-      m->hashes.shrink_to_fit();
-    }
-  }
-
+void ObjectFile<E>::reattach_section_pieces(Context<E> &ctx) {
   // Attach section pieces to symbols.
   for (i64 i = 1; i < this->elf_syms.size(); i++) {
     Symbol<E> &sym = *this->symbols[i];
@@ -856,8 +759,9 @@ void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
     if (esym.is_abs() || esym.is_common() || esym.is_undef())
       continue;
 
-    std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[get_shndx(esym)];
-    if (!m || m->fragments.empty())
+    i64 shndx = get_shndx(esym);
+    std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[shndx];
+    if (!m || !m->parent.resolved)
       continue;
 
     SectionFragment<E> *frag;
@@ -874,49 +778,51 @@ void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
   // Compute the size of frag_syms.
   i64 nfrag_syms = 0;
   for (std::unique_ptr<InputSection<E>> &isec : sections)
-    if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC))
+    if (isec && (isec->shdr().sh_flags & SHF_ALLOC))
       for (ElfRel<E> &r : isec->get_rels(ctx))
         if (const ElfSym<E> &esym = this->elf_syms[r.r_sym];
-            esym.st_type == STT_SECTION && mergeable_sections[get_shndx(esym)])
-          nfrag_syms++;
+            esym.st_type == STT_SECTION)
+          if (mergeable_sections[get_shndx(esym)])
+            nfrag_syms++;
 
   this->frag_syms.resize(nfrag_syms);
 
-  // For each relocation referring a mergeable section symbol, we create
-  // a new dummy non-section symbol and redirect the relocation to the
-  // newly-created symbol.
+  // For each relocation referring to a mergeable section symbol, we
+  // create a new dummy non-section symbol and redirect the relocation
+  // to the newly created symbol.
   i64 idx = 0;
   for (std::unique_ptr<InputSection<E>> &isec : sections) {
-    if (!isec || !isec->is_alive || !(isec->shdr().sh_flags & SHF_ALLOC))
-      continue;
-
-    for (ElfRel<E> &r : isec->get_rels(ctx)) {
-      const ElfSym<E> &esym = this->elf_syms[r.r_sym];
-      if (esym.st_type != STT_SECTION)
-        continue;
-
-      std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[get_shndx(esym)];
-      if (!m)
-        continue;
-
-      i64 r_addend = get_addend(*isec, r);
-
-      SectionFragment<E> *frag;
-      i64 in_frag_offset;
-      std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend);
+    if (isec && (isec->shdr().sh_flags & SHF_ALLOC)) {
+      for (ElfRel<E> &r : isec->get_rels(ctx)) {
+        const ElfSym<E> &esym = this->elf_syms[r.r_sym];
+        if (esym.st_type != STT_SECTION)
+          continue;
 
-      if (!frag)
-        Fatal(ctx) << *this << ": bad relocation at " << r.r_sym;
+        i64 shndx = get_shndx(esym);
+        std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[shndx];
+        if (!m)
+          continue;
 
-      Symbol<E> &sym = this->frag_syms[idx];
-      sym.file = this;
-      sym.set_name("<fragment>");
-      sym.sym_idx = r.r_sym;
-      sym.visibility = STV_HIDDEN;
-      sym.set_frag(frag);
-      sym.value = in_frag_offset - r_addend;
-      r.r_sym = this->elf_syms.size() + idx;
-      idx++;
+        assert(m->parent.resolved);
+
+        i64 r_addend = get_addend(*isec, r);
+        SectionFragment<E> *frag;
+        i64 in_frag_offset;
+        std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend);
+
+        if (!frag)
+          Fatal(ctx) << *this << ": bad relocation at " << r.r_sym;
+
+        Symbol<E> &sym = this->frag_syms[idx];
+        sym.file = this;
+        sym.set_name("<fragment>");
+        sym.sym_idx = r.r_sym;
+        sym.visibility = STV_HIDDEN;
+        sym.set_frag(frag);
+        sym.value = in_frag_offset - r_addend;
+        r.r_sym = this->elf_syms.size() + idx;
+        idx++;
+      }
     }
   }
 
@@ -929,6 +835,8 @@ void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
 template <typename E>
 void ObjectFile<E>::parse(Context<E> &ctx) {
   sections.resize(this->elf_sections.size());
+  mergeable_sections.resize(sections.size());
+
   symtab_sec = this->find_section(SHT_SYMTAB);
 
   if (symtab_sec) {
@@ -945,7 +853,6 @@ void ObjectFile<E>::parse(Context<E> &ctx) {
   initialize_sections(ctx);
   initialize_symbols(ctx);
   sort_relocations(ctx);
-  parse_ehframe(ctx);
 }
 
 // Symbols with higher priorities overwrites symbols with lower priorities.
@@ -1142,8 +1049,6 @@ void ObjectFile<E>::convert_common_symbols(Context<E> &ctx) {
       continue;
 
     Symbol<E> &sym = *this->symbols[i];
-    std::scoped_lock lock(sym.mu);
-
     if (sym.file != this) {
       if (ctx.arg.warn_common)
         Warn(ctx) << *this << ": multiple common symbols: " << sym;
@@ -1164,7 +1069,6 @@ void ObjectFile<E>::convert_common_symbols(Context<E> &ctx) {
     i64 idx = this->elf_sections.size() + elf_sections2.size() - 1;
     auto isec = std::make_unique<InputSection<E>>(ctx, *this, idx);
 
-    sym.file = this;
     sym.set_input_section(isec.get());
     sym.value = 0;
     sym.sym_idx = i;
@@ -1199,9 +1103,6 @@ static bool should_write_to_local_symtab(Context<E> &ctx, Symbol<E> &sym) {
 
 template <typename E>
 void ObjectFile<E>::compute_symtab_size(Context<E> &ctx) {
-  if (ctx.arg.strip_all)
-    return;
-
   this->output_sym_indices.resize(this->elf_syms.size(), -1);
 
   auto is_alive = [&](Symbol<E> &sym) -> bool {
@@ -1299,12 +1200,6 @@ SharedFile<E> *SharedFile<E>::create(Context<E> &ctx, MappedFile *mf) {
   return obj;
 }
 
-template <typename E>
-SharedFile<E>::SharedFile(Context<E> &ctx, MappedFile *mf)
-  : InputFile<E>(ctx, mf) {
-  this->is_alive = !ctx.as_needed;
-}
-
 template <typename E>
 std::string SharedFile<E>::get_soname(Context<E> &ctx) {
   if (ElfShdr<E> *sec = this->find_section(SHT_DYNAMIC))
@@ -1367,6 +1262,32 @@ void SharedFile<E>::parse(Context<E> &ctx) {
   counter += this->elf_syms.size();
 }
 
+template <typename E>
+std::vector<std::string_view> SharedFile<E>::get_dt_needed(Context<E> &ctx) {
+  // Returns DT_NEEDED names; the dynamic segment is read as tag/value words
+  std::span<Word<E>> dynamic;
+  for (ElfPhdr<E> &phdr : this->get_phdrs())
+    if (phdr.p_type == PT_DYNAMIC)
+      dynamic = {(Word<E> *)(this->mf->data + phdr.p_offset),
+                 (size_t)phdr.p_memsz / sizeof(Word<E>)};
+
+  // Find a string table. `i + 1 < size()` keeps the value read in
+  // bounds even if a malformed file yields an odd number of words.
+  char *strtab = nullptr;
+  for (i64 i = 0; i + 1 < dynamic.size(); i += 2)
+    if (dynamic[i] == DT_STRTAB)
+      strtab = (char *)this->mf->data + dynamic[i + 1];
+  if (!strtab)
+    return {};
+
+  // Find all DT_NEEDED entries
+  std::vector<std::string_view> vec;
+  for (i64 i = 0; i + 1 < dynamic.size(); i += 2)
+    if (dynamic[i] == DT_NEEDED)
+      vec.push_back(strtab + dynamic[i + 1]);
+  return vec;
+}
+
 // Symbol versioning is a GNU extension to the ELF file format. I don't
 // particularly like the feature as it complicates the semantics of
 // dynamic linking, but we need to support it anyway because it is
@@ -1431,7 +1352,8 @@ void SharedFile<E>::resolve_symbols(Context<E> &ctx) {
   for (i64 i = 0; i < this->symbols.size(); i++) {
     Symbol<E> &sym = *this->symbols[i];
     const ElfSym<E> &esym = this->elf_syms[i];
-    if (esym.is_undef())
+
+    if (esym.is_undef() || sym.skip_dso)
       continue;
 
     std::scoped_lock lock(sym.mu);
@@ -1458,7 +1380,7 @@ SharedFile<E>::mark_live_objects(Context<E> &ctx,
     if (sym.is_traced)
       print_trace_symbol(ctx, *this, esym, sym);
 
-    if (esym.is_undef() && !esym.is_weak() && sym.file && !sym.file->is_dso &&
+    if (esym.is_undef() && !esym.is_weak() && sym.file &&
         !sym.file->is_alive.test_and_set()) {
       feeder(sym.file);
 
@@ -1524,9 +1446,6 @@ bool SharedFile<E>::is_readonly(Symbol<E> *sym) {
 
 template <typename E>
 void SharedFile<E>::compute_symtab_size(Context<E> &ctx) {
-  if (ctx.arg.strip_all)
-    return;
-
   this->output_sym_indices.resize(this->elf_syms.size(), -1);
 
   // Compute the size of global symbols.
@@ -1576,4 +1495,4 @@ template std::string_view demangle(const Symbol<E> &);
 template std::ostream &operator<<(std::ostream &, const Symbol<E> &);
 template std::ostream &operator<<(std::ostream &, const InputFile<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/input-sections.cc b/src/input-sections.cc
similarity index 51%
rename from elf/input-sections.cc
rename to src/input-sections.cc
index 8383a7fd..399c80a9 100644
--- a/elf/input-sections.cc
+++ b/src/input-sections.cc
@@ -4,12 +4,7 @@
 #include <zlib.h>
 #include <zstd.h>
 
-namespace mold::elf {
-
-typedef enum {
-  NONE, ERROR, COPYREL, DYN_COPYREL, PLT, CPLT, DYN_CPLT, DYNREL,
-  BASEREL, IFUNC_DYNREL,
-} Action;
+namespace mold {
 
 static i64 to_p2align(u64 alignment) {
   if (alignment == 0)
@@ -106,125 +101,31 @@ void InputSection<E>::copy_contents(Context<E> &ctx, u8 *buf) {
   }
 }
 
-template <typename E>
-static bool
-is_relr_reloc(Context<E> &ctx, InputSection<E> &isec, const ElfRel<E> &rel) {
-  ElfShdr<E> shdr = isec.shdr();
-  return ctx.arg.pack_dyn_relocs_relr &&
-         !(shdr.sh_flags & SHF_EXECINSTR) &&
-         shdr.sh_addralign % sizeof(Word<E>) == 0 &&
-         rel.r_offset % sizeof(Word<E>) == 0;
-}
+typedef enum : u8 { NONE, ERROR, COPYREL, PLT, CPLT } Action;
 
 template <typename E>
-static void scan_rel(Context<E> &ctx, InputSection<E> &isec, Symbol<E> &sym,
-                     const ElfRel<E> &rel, Action action) {
-  bool writable = (isec.shdr().sh_flags & SHF_WRITE);
-
-  auto error = [&] {
-    std::string msg = sym.is_absolute() ? "-fno-PIC" : "-fPIC";
-    Error(ctx) << isec << ": " << rel << " relocation at offset 0x"
-               << std::hex << rel.r_offset << " against symbol `"
-               << sym << "' can not be used; recompile with " << msg;
-  };
-
-  auto check_textrel = [&] {
-    if (!writable) {
-      if (ctx.arg.z_text) {
-        error();
-      } else if (ctx.arg.warn_textrel) {
-        Warn(ctx) << isec << ": relocation against symbol `" << sym
-                  << "' in read-only section";
-      }
-      ctx.has_textrel = true;
-    }
-  };
-
-  auto copyrel = [&] {
-    assert(sym.is_imported);
-    if (sym.esym().st_visibility == STV_PROTECTED) {
-      Error(ctx) << isec
-                 << ": cannot make copy relocation for protected symbol '" << sym
-                 << "', defined in " << *sym.file << "; recompile with -fPIC";
-    }
-    sym.flags |= NEEDS_COPYREL;
-  };
-
-  auto dynrel = [&] {
-    check_textrel();
-    isec.file.num_dynrel++;
-  };
-
+static void do_action(Context<E> &ctx, Action action, InputSection<E> &isec,
+                      Symbol<E> &sym, const ElfRel<E> &rel) {
   switch (action) {
   case NONE:
     break;
   case ERROR:
-    // Print out the "recompile with -fPIC" error message.
-    error();
+    Error(ctx) << isec << ": " << rel << " relocation at offset 0x"
+               << std::hex << rel.r_offset << " against symbol `"
+               << sym << "' can not be used; recompile with -fPIC";
     break;
   case COPYREL:
-    // Create a copy relocation.
-    if (!ctx.arg.z_copyreloc)
-      error();
-    copyrel();
-    break;
-  case DYN_COPYREL:
-    // Same as COPYREL but try to avoid creating a copy relocation by
-    // creating a dynamic relocation instead if the relocation is in
-    // a writable section.
-    //
-    // GHC (Glasgow Haskell Compiler) places a small amount of data in
-    // .text before each function and access that data with a fixed
-    // offset. The function breaks if we copy-relocate the data. For such
-    // programs, we should avoid copy relocations if possible.
-    //
-    // Besides GHC, copy relocation is a hacky solution, so if we can
-    // represent a relocation either with copyrel or dynrel, we prefer
-    // dynamic relocation.
-    if (writable || !ctx.arg.z_copyreloc)
-      dynrel();
-    else
-      copyrel();
+    // Create a copy relocation
+    sym.flags |= NEEDS_COPYREL;
     break;
   case PLT:
-    // Create a PLT entry.
+    // Create a PLT entry
     sym.flags |= NEEDS_PLT;
     break;
   case CPLT:
-    // Create a canonical PLT entry.
+    // Create a canonical PLT entry
     sym.flags |= NEEDS_CPLT;
     break;
-  case DYN_CPLT:
-    // Same as CPLT but try to avoid creating a canonical PLT creating by
-    // creating a dynamic relocation instead if the relocation is in a
-    // writable section. The motivation behind it is hte same as DYN_COPYREL.
-    if (writable)
-      dynrel();
-    else
-      sym.flags |= NEEDS_CPLT;
-    break;
-  case DYNREL:
-    // Create a dynamic relocation.
-    dynrel();
-    break;
-  case BASEREL:
-    // Create a base relocation.
-    check_textrel();
-    if (!is_relr_reloc(ctx, isec, rel))
-      isec.file.num_dynrel++;
-    break;
-  case IFUNC_DYNREL:
-    // Create an IRELATIVE relocation for a GNU ifunc symbol.
-    //
-    // We usually create an IRELATIVE relocation in .got for each ifunc.
-    // However, if a statically-initialized pointer is initialized to an
-    // ifunc's address, we have no choice other than emitting an IRELATIVE
-    // relocation for each such pointer.
-    dynrel();
-    ctx.num_ifunc_dynrels++;
-    break;
-  default:
-    unreachable();
   }
 }
 
@@ -249,102 +150,44 @@ static inline i64 get_sym_type(Symbol<E> &sym) {
 }
 
 template <typename E>
-static Action get_pcrel_action(Context<E> &ctx, Symbol<E> &sym) {
+void InputSection<E>::scan_pcrel(Context<E> &ctx, Symbol<E> &sym,
+                                 const ElfRel<E> &rel) {
   // This is for PC-relative relocations (e.g. R_X86_64_PC32).
   // We cannot promote them to dynamic relocations because the dynamic
   // linker generally does not support PC-relative relocations.
-  static Action table[3][4] = {
+  static Action table[][4] = {
     // Absolute  Local    Imported data  Imported code
     {  ERROR,    NONE,    ERROR,         PLT    },  // Shared object
-    {  ERROR,    NONE,    COPYREL,       PLT    },  // Position-independent exec
+    {  ERROR,    NONE,    COPYREL,       CPLT   },  // Position-independent exec
     {  NONE,     NONE,    COPYREL,       CPLT   },  // Position-dependent exec
   };
 
-  return table[get_output_type(ctx)][get_sym_type(sym)];
+  Action action = table[get_output_type(ctx)][get_sym_type(sym)];
+  do_action(ctx, action, *this, sym, rel);
 }
 
 template <typename E>
-static Action get_absrel_action(Context<E> &ctx, Symbol<E> &sym) {
+void InputSection<E>::scan_absrel(Context<E> &ctx, Symbol<E> &sym,
+                                  const ElfRel<E> &rel) {
   // This is a decision table for absolute relocations that is smaller
   // than the pointer size (e.g. R_X86_64_32). Since the dynamic linker
   // generally does not support dynamic relocations smaller than the
   // pointer size, we need to report an error if a relocation cannot be
   // resolved at link-time.
-  static Action table[3][4] = {
+  static Action table[][4] = {
     // Absolute  Local    Imported data  Imported code
     {  NONE,     ERROR,   ERROR,         ERROR },  // Shared object
     {  NONE,     ERROR,   ERROR,         ERROR },  // Position-independent exec
     {  NONE,     NONE,    COPYREL,       CPLT  },  // Position-dependent exec
   };
 
-  return table[get_output_type(ctx)][get_sym_type(sym)];
-}
-
-template <typename E>
-static Action get_dyn_absrel_action(Context<E> &ctx, Symbol<E> &sym) {
-  if (sym.is_ifunc())
-    return sym.is_pde_ifunc(ctx) ? NONE : IFUNC_DYNREL;
-
-  // This is a decision table for absolute relocations for the pointer
-  // size data (e.g. R_X86_64_64). Unlike the absrel_table, we can emit
-  // a dynamic relocation if we cannot resolve an address at link-time.
-  static Action table[3][4] = {
-    // Absolute  Local    Imported data  Imported code
-    {  NONE,     BASEREL, DYNREL,        DYNREL   },  // Shared object
-    {  NONE,     BASEREL, DYNREL,        DYNREL   },  // Position-independent exec
-    {  NONE,     NONE,    DYN_COPYREL,   DYN_CPLT },  // Position-dependent exec
-  };
-
-  return table[get_output_type(ctx)][get_sym_type(sym)];
-}
-
-template <typename E>
-static Action get_ppc64_toc_action(Context<E> &ctx, Symbol<E> &sym) {
-  if (sym.is_ifunc())
-    return IFUNC_DYNREL;
-
-  // As a special case, we do not create copy relocations nor canonical
-  // PLTs for .toc sections. PPC64's .toc is a compiler-generated
-  // GOT-like section, and no user-generated code directly uses values
-  // in it.
-  static Action table[3][4] = {
-    // Absolute  Local    Imported data  Imported code
-    {  NONE,     BASEREL, DYNREL,        DYNREL },  // Shared object
-    {  NONE,     BASEREL, DYNREL,        DYNREL },  // Position-independent exec
-    {  NONE,     NONE,    DYNREL,        DYNREL },  // Position-dependent exec
-  };
-
-  return table[get_output_type(ctx)][get_sym_type(sym)];
-}
-
-template <typename E>
-void InputSection<E>::scan_pcrel(Context<E> &ctx, Symbol<E> &sym,
-                                 const ElfRel<E> &rel) {
-  scan_rel(ctx, *this, sym, rel, get_pcrel_action(ctx, sym));
-}
-
-template <typename E>
-void InputSection<E>::scan_absrel(Context<E> &ctx, Symbol<E> &sym,
-                                  const ElfRel<E> &rel) {
-  scan_rel(ctx, *this, sym, rel, get_absrel_action(ctx, sym));
-}
-
-template <typename E>
-void InputSection<E>::scan_dyn_absrel(Context<E> &ctx, Symbol<E> &sym,
-                                      const ElfRel<E> &rel) {
-  scan_rel(ctx, *this, sym, rel, get_dyn_absrel_action(ctx, sym));
-}
-
-template <typename E>
-void InputSection<E>::scan_toc_rel(Context<E> &ctx, Symbol<E> &sym,
-                                   const ElfRel<E> &rel) {
-  scan_rel(ctx, *this, sym, rel, get_ppc64_toc_action(ctx, sym));
+  Action action = table[get_output_type(ctx)][get_sym_type(sym)];
+  do_action(ctx, action, *this, sym, rel);
 }
 
 template <typename E>
 void InputSection<E>::scan_tlsdesc(Context<E> &ctx, Symbol<E> &sym) {
-  if (ctx.arg.is_static ||
-      (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) {
+  if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) {
     // Relax TLSDESC to Local Exec. In this case, we directly materialize
     // a TP-relative offset, so no dynamic relocation is needed.
     //
@@ -373,92 +216,41 @@ void InputSection<E>::check_tlsle(Context<E> &ctx, Symbol<E> &sym,
                << " recompile with -fPIC";
 }
 
-template <typename E>
-static void apply_absrel(Context<E> &ctx, InputSection<E> &isec,
-                         Symbol<E> &sym, const ElfRel<E> &rel, u8 *loc,
-                         u64 S, i64 A, u64 P, ElfRel<E> *&dynrel,
-                         Action action) {
-  bool writable = (isec.shdr().sh_flags & SHF_WRITE);
-
-  auto emit_abs_dynrel = [&] {
-    *dynrel++ = ElfRel<E>(P, E::R_ABS, sym.get_dynsym_idx(ctx), A);
-    if (ctx.arg.apply_dynamic_relocs)
-      *(Word<E> *)loc = A;
-  };
-
-  switch (action) {
-  case COPYREL:
-  case CPLT:
-  case NONE:
-    *(Word<E> *)loc = S + A;
-    break;
-  case BASEREL:
-    if (is_relr_reloc(ctx, isec, rel)) {
-      *(Word<E> *)loc = S + A;
-    } else {
-      *dynrel++ = ElfRel<E>(P, E::R_RELATIVE, 0, S + A);
-      if (ctx.arg.apply_dynamic_relocs)
-        *(Word<E> *)loc = S + A;
-    }
-    break;
-  case DYN_COPYREL:
-    if (writable || !ctx.arg.z_copyreloc)
-      emit_abs_dynrel();
-    else
-      *(Word<E> *)loc = S + A;
-    break;
-  case DYN_CPLT:
-    if (writable)
-      emit_abs_dynrel();
-    else
-      *(Word<E> *)loc = S + A;
-    break;
-  case DYNREL:
-    emit_abs_dynrel();
-    break;
-  case IFUNC_DYNREL:
-    if constexpr (supports_ifunc<E>) {
-      u64 addr = sym.get_addr(ctx, NO_PLT) + A;
-      *dynrel++ = ElfRel<E>(P, E::R_IRELATIVE, 0, addr);
-      if (ctx.arg.apply_dynamic_relocs)
-        *(Word<E> *)loc = addr;
-    } else {
-      unreachable();
-    }
-    break;
-  default:
-    unreachable();
-  }
-}
-
-template <typename E>
-void InputSection<E>::apply_dyn_absrel(Context<E> &ctx, Symbol<E> &sym,
-                                       const ElfRel<E> &rel, u8 *loc,
-                                       u64 S, i64 A, u64 P,
-                                       ElfRel<E> **dynrel) {
-  apply_absrel(ctx, *this, sym, rel, loc, S, A, P, *dynrel,
-               get_dyn_absrel_action(ctx, sym));
-}
-
-template <typename E>
-void InputSection<E>::apply_toc_rel(Context<E> &ctx, Symbol<E> &sym,
-                                    const ElfRel<E> &rel, u8 *loc,
-                                    u64 S, i64 A, u64 P,
-                                    ElfRel<E> **dynrel) {
-  apply_absrel(ctx, *this, sym, rel, loc, S, A, P, *dynrel,
-               get_ppc64_toc_action(ctx, sym));
-}
-
 template <typename E>
 void InputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
   if (shdr().sh_type == SHT_NOBITS || sh_size == 0)
     return;
 
-  // Copy data
-  if constexpr (is_riscv<E>)
-    copy_contents_riscv(ctx, buf);
-  else
+  // Copy data. In RISC-V and LoongArch object files, a section is not
+  // an atomic unit of copying because of relaxation. That is, some
+  // relocations are allowed to remove bytes from the middle of a
+  // section and shrink its overall size.
+  if constexpr (is_riscv<E> || is_loongarch<E>) {
+    if (extra.r_deltas.empty()) {
+      // If a section is not relaxed, we can copy it as one big chunk.
+      copy_contents(ctx, buf);
+    } else {
+      // A relaxed section is copied piece-wise.
+      std::span<const ElfRel<E>> rels = get_rels(ctx);
+      u8 *buf2 = buf;
+      i64 pos = 0;
+
+      for (i64 i = 0; i < rels.size(); i++) {
+        i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i];
+        if (delta == 0)
+          continue;
+        assert(delta > 0);
+
+        const ElfRel<E> &r = rels[i];
+        memcpy(buf2, contents.data() + pos, r.r_offset - pos);
+        buf2 += r.r_offset - pos;
+        pos = r.r_offset + delta;
+      }
+      memcpy(buf2, contents.data() + pos, contents.size() - pos);
+    }
+  } else {
     copy_contents(ctx, buf);
+  }
 
   // Apply relocations
   if (!ctx.arg.relocatable) {
@@ -474,12 +266,14 @@ template <typename E>
 std::string_view
 InputSection<E>::get_func_name(Context<E> &ctx, i64 offset) const {
   for (Symbol<E> *sym : file.symbols) {
-    const ElfSym<E> &esym = sym->esym();
-    if (esym.st_shndx == shndx && esym.st_type == STT_FUNC &&
-        esym.st_value <= offset && offset < esym.st_value + esym.st_size) {
-      if (ctx.arg.demangle)
-        return demangle(*sym);
-      return sym->name();
+    if (sym->file == &file) {
+      const ElfSym<E> &esym = sym->esym();
+      if (esym.st_shndx == shndx && esym.st_type == STT_FUNC &&
+          esym.st_value <= offset && offset < esym.st_value + esym.st_size) {
+        if (ctx.arg.demangle)
+          return demangle(*sym);
+        return sym->name();
+      }
     }
   }
   return "";
@@ -530,6 +324,7 @@ bool InputSection<E>::record_undef_error(Context<E> &ctx, const ElfRel<E> &rel)
   // Every ELF file has an absolute local symbol as its first symbol.
   // Referring to that symbol is always valid.
   bool is_undef = esym.is_undef() && !esym.is_weak() && sym.sym_idx;
+
   if (is_undef && sym.esym().is_undef()) {
     if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR && !sym.is_imported) {
       record();
@@ -541,20 +336,105 @@ bool InputSection<E>::record_undef_error(Context<E> &ctx, const ElfRel<E> &rel)
     }
   }
 
-  // If a protected/hidden undefined symbol is resolved to other .so,
-  // it's handled as if no symbols were found.
-  if (sym.file->is_dso &&
-      (sym.visibility == STV_PROTECTED || sym.visibility == STV_HIDDEN)) {
-    record();
-    return true;
+  return false;
+}
+
+template <typename E>
+MergeableSection<E>::MergeableSection(Context<E> &ctx, MergedSection<E> &parent,
+                                      std::unique_ptr<InputSection<E>> &isec)
+  : parent(parent), section(std::move(isec)), p2align(section->p2align) {
+  section->uncompress(ctx);
+  // Ownership of isec was moved into section; register under parent.mu.
+  std::scoped_lock lock(parent.mu);
+  parent.members.push_back(this);
+}
+
+// Find the first all-zero entsize-byte unit at or after pos; npos if none.
+static size_t find_null(std::string_view data, i64 pos, i64 entsize) {
+  if (entsize == 1)
+    return data.find('\0', pos);
+  // Signed compare; data.size() - entsize wraps when size < entsize.
+  for (; pos + entsize <= (i64)data.size(); pos += entsize)
+    if (data.substr(pos, entsize).find_first_not_of('\0') == data.npos)
+      return pos;
+  return data.npos;
+}
+
+// Mergeable sections (sections with the SHF_MERGE bit) typically contain
+// string literals. The linker is expected to split the section contents
+// into null-terminated strings, merge them with mergeable strings
+// from other object files, and emit uniquified strings to an output
+// file.
+//
+// This mechanism reduces the size of an output file. If two source
+// files happen to contain the same string literal, the output will
+// contain only a single copy of it.
+//
+// It is less common than string literals, but mergeable sections can
+// contain fixed-sized read-only records too.
+//
+// This function splits the section contents into small pieces that we
+// call "section fragments". A section fragment is the unit of merging.
+//
+// We do not support mergeable sections that have relocations.
+template <typename E>
+void MergeableSection<E>::split_contents(Context<E> &ctx) {
+  std::string_view data = section->contents;
+  if (data.size() > UINT32_MAX)
+    Fatal(ctx) << *section
+               << ": mergeable section too large";
+
+  i64 entsize = parent.shdr.sh_entsize;
+
+  // Split sections
+  if (parent.shdr.sh_flags & SHF_STRINGS) {
+    for (i64 pos = 0; pos < data.size();) {
+      frag_offsets.push_back(pos);
+      size_t end = find_null(data, pos, entsize);
+      if (end == data.npos)
+        Fatal(ctx) << *section << ": string is not null terminated";
+      pos = end + entsize;
+    }
+  } else {
+    if (data.size() % entsize)
+      Fatal(ctx) << *section << ": section size is not multiple of sh_entsize";
+    frag_offsets.reserve(data.size() / entsize);
+
+    for (i64 pos = 0; pos < data.size(); pos += entsize)
+      frag_offsets.push_back(pos);
   }
 
-  return false;
+  // Compute hashes for section pieces
+  HyperLogLog estimator;
+  hashes.reserve(frag_offsets.size());
+
+  for (i64 i = 0; i < frag_offsets.size(); i++) {
+    u64 hash = hash_string(get_contents(i));
+    hashes.push_back(hash);
+    estimator.insert(hash);
+  }
+
+  parent.estimator.merge(estimator);
+
+  static Counter counter("string_fragments");
+  counter += frag_offsets.size();
+}
+
+template <typename E>
+void MergeableSection<E>::resolve_contents(Context<E> &ctx) {
+  fragments.reserve(frag_offsets.size());
+  for (i64 i = 0; i < frag_offsets.size(); i++)
+    fragments.push_back(parent.insert(ctx, get_contents(i), hashes[i], p2align));
+
+  // Reclaim memory as we'll never use this vector again
+  hashes.clear();
+  hashes.shrink_to_fit();
 }
 
 using E = MOLD_TARGET;
 
 template bool cie_equals(const CieRecord<E> &, const CieRecord<E> &);
 template class InputSection<E>;
+template class MergeableSection<E>;
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/linker-script.cc b/src/linker-script.cc
similarity index 51%
rename from elf/linker-script.cc
rename to src/linker-script.cc
index 28aedad9..6fe5dab6 100644
--- a/elf/linker-script.cc
+++ b/src/linker-script.cc
@@ -8,10 +8,7 @@
 #include <cctype>
 #include <iomanip>
 
-namespace mold::elf {
-
-template <typename E>
-void read_version_script(Context<E> &ctx, std::span<std::string_view> &tok);
+namespace mold {
 
 static std::string_view get_line(std::string_view input, const char *pos) {
   assert(input.data() <= pos);
@@ -31,40 +28,27 @@ static std::string_view get_line(std::string_view input, const char *pos) {
 }
 
 template <typename E>
-class SyntaxError {
-public:
-  SyntaxError(Context<E> &ctx, std::string_view errpos) : out(ctx) {
-    std::string_view contents = ctx.script_file->get_contents();
-    std::string_view line = get_line(contents, errpos.data());
-
-    i64 lineno = 1;
-    for (i64 i = 0; contents.data() + i < line.data(); i++)
-      if (contents[i] == '\n')
-        lineno++;
-
-    std::string label = ctx.script_file->name + ":" +
-                        std::to_string(lineno) + ": ";
-    i64 indent = strlen("mold: fatal: ") + label.size();
-    i64 column = errpos.data() - line.data();
-
-    out << label << line << "\n"
-        << std::string(indent + column, ' ') << "^ ";
-  }
+void Script<E>::error(std::string_view pos, std::string msg) {
+  std::string_view input = mf->get_contents();
+  std::string_view line = get_line(input, pos.data());
 
-  template <typename T> SyntaxError &operator<<(T &&val) {
-    out << std::forward<T>(val);
-    return *this;
-  }
+  i64 lineno = 1;
+  for (i64 i = 0; input.data() + i < line.data(); i++)
+    if (input[i] == '\n')
+      lineno++;
 
-  [[noreturn]] ~SyntaxError() = default;
+  std::string label = mf->name + ":" + std::to_string(lineno) + ": ";
+  i64 indent = strlen("mold: fatal: ") + label.size();
+  i64 column = pos.data() - line.data();
 
-  Fatal<Context<E>> out;
-};
+  Fatal(ctx) << label << line << "\n"
+             << std::string(indent + column, ' ') << "^ " << msg;
+}
 
 template <typename E>
-static std::vector<std::string_view>
-tokenize(Context<E> &ctx, std::string_view input) {
-  std::vector<std::string_view> vec;
+void Script<E>::tokenize() {
+  std::string_view input = mf->get_contents();
+
   while (!input.empty()) {
     if (isspace(input[0])) {
       input = input.substr(1);
@@ -74,7 +58,7 @@ tokenize(Context<E> &ctx, std::string_view input) {
     if (input.starts_with("/*")) {
       i64 pos = input.find("*/", 2);
       if (pos == std::string_view::npos)
-        SyntaxError(ctx, input) << "unclosed comment";
+        error(input, "unclosed comment");
       input = input.substr(pos + 2);
       continue;
     }
@@ -90,8 +74,8 @@ tokenize(Context<E> &ctx, std::string_view input) {
     if (input[0] == '"') {
       i64 pos = input.find('"', 1);
       if (pos == std::string_view::npos)
-        SyntaxError(ctx, input) << "unclosed string literal";
-      vec.push_back(input.substr(0, pos + 1));
+        error(input, "unclosed string literal");
+      tokens.push_back(input.substr(0, pos + 1));
       input = input.substr(pos + 1);
       continue;
     }
@@ -105,20 +89,18 @@ tokenize(Context<E> &ctx, std::string_view input) {
     else if (pos == input.npos)
       pos = input.size();
 
-    vec.push_back(input.substr(0, pos));
+    tokens.push_back(input.substr(0, pos));
     input = input.substr(pos);
   }
-  return vec;
 }
 
 template <typename E>
-static std::span<std::string_view>
-skip(Context<E> &ctx, std::span<std::string_view> tok, std::string_view str) {
+std::span<std::string_view>
+Script<E>::skip(std::span<std::string_view> tok, std::string_view str) {
   if (tok.empty())
-    Fatal(ctx) << ctx.script_file->name << ": expected '" << str
-               << "', but got EOF";
+    Fatal(ctx) << mf->name << ": expected '" << str << "', but got EOF";
   if (tok[0] != str)
-    SyntaxError(ctx, tok[0]) << "expected '" << str << "'";
+    error(tok[0], "expected '" + std::string(str) + "'");
   return tok.subspan(1);
 }
 
@@ -131,13 +113,13 @@ static std::string_view unquote(std::string_view s) {
 }
 
 template <typename E>
-static std::span<std::string_view>
-read_output_format(Context<E> &ctx, std::span<std::string_view> tok) {
-  tok = skip(ctx, tok, "(");
+std::span<std::string_view>
+Script<E>::read_output_format(std::span<std::string_view> tok) {
+  tok = skip(tok, "(");
   while (!tok.empty() && tok[0] != ")")
     tok = tok.subspan(1);
   if (tok.empty())
-    Fatal(ctx) << ctx.script_file->name << ": expected ')', but got EOF";
+    Fatal(ctx) << mf->name << ": expected ')', but got EOF";
   return tok.subspan(1);
 }
 
@@ -149,8 +131,7 @@ static bool is_in_sysroot(Context<E> &ctx, std::string path) {
 }
 
 template <typename E>
-static MappedFile *
-resolve_path(Context<E> &ctx, std::string_view tok, bool check_target) {
+MappedFile *Script<E>::resolve_path(std::string_view tok, bool check_target) {
   std::string str(unquote(tok));
 
   auto open = [&](const std::string &path) -> MappedFile * {
@@ -159,7 +140,7 @@ resolve_path(Context<E> &ctx, std::string_view tok, bool check_target) {
       return nullptr;
 
     if (check_target) {
-      std::string_view target = get_machine_type(ctx, mf);
+      std::string_view target = get_machine_type(ctx, rctx, mf);
       if (!target.empty() && target != E::target_name) {
         Warn(ctx) << path << ": skipping incompatible file: " << target
                   << " (e_machine " << (int)E::e_machine << ")";
@@ -171,7 +152,7 @@ resolve_path(Context<E> &ctx, std::string_view tok, bool check_target) {
 
   // GNU ld prepends the sysroot if a pathname starts with '/' and the
   // script being processed is in the sysroot. We do the same.
-  if (str.starts_with('/') && is_in_sysroot(ctx, ctx.script_file->name))
+  if (str.starts_with('/') && is_in_sysroot(ctx, mf->name))
     return must_open_file(ctx, ctx.arg.sysroot + str);
 
   if (str.starts_with('=')) {
@@ -184,11 +165,11 @@ resolve_path(Context<E> &ctx, std::string_view tok, bool check_target) {
   }
 
   if (str.starts_with("-l"))
-    return find_library(ctx, str.substr(2));
+    return find_library(ctx, rctx, str.substr(2));
 
   if (!str.starts_with('/'))
-    if (MappedFile *mf = open(path_clean(ctx.script_file->name + "/../" + str)))
-      return mf;
+    if (MappedFile *mf2 = open(path_clean(mf->name + "/../" + str)))
+      return mf2;
 
   if (MappedFile *mf = open(str))
     return mf;
@@ -199,50 +180,48 @@ resolve_path(Context<E> &ctx, std::string_view tok, bool check_target) {
       return mf;
   }
 
-  SyntaxError(ctx, tok) << "library not found: " << str;
+  error(tok, "library not found: " + str);
 }
 
 template <typename E>
-static std::span<std::string_view>
-read_group(Context<E> &ctx, std::span<std::string_view> tok) {
-  tok = skip(ctx, tok, "(");
+std::span<std::string_view>
+Script<E>::read_group(std::span<std::string_view> tok) {
+  tok = skip(tok, "(");
 
   while (!tok.empty() && tok[0] != ")") {
     if (tok[0] == "AS_NEEDED") {
-      bool orig = ctx.as_needed;
-      ctx.as_needed = true;
-      tok = read_group(ctx, tok.subspan(1));
-      ctx.as_needed = orig;
+      bool orig = rctx.as_needed;
+      rctx.as_needed = true;
+      tok = read_group(tok.subspan(1));
+      rctx.as_needed = orig;
       continue;
     }
 
-    MappedFile *mf = resolve_path(ctx, tok[0], true);
-    read_file(ctx, mf);
+    MappedFile *mf = resolve_path(tok[0], true);
+    read_file(ctx, rctx, mf);
     tok = tok.subspan(1);
   }
 
   if (tok.empty())
-    Fatal(ctx) << ctx.script_file->name << ": expected ')', but got EOF";
+    Fatal(ctx) << mf->name << ": expected ')', but got EOF";
   return tok.subspan(1);
 }
 
 template <typename E>
-void parse_linker_script(Context<E> &ctx, MappedFile *mf) {
-  ctx.script_file = mf;
-
-  std::vector<std::string_view> vec = tokenize(ctx, mf->get_contents());
-  std::span<std::string_view> tok = vec;
+void Script<E>::parse_linker_script() {
+  std::call_once(once, [&] { tokenize(); });
+  std::span<std::string_view> tok = tokens;
 
   while (!tok.empty()) {
     if (tok[0] == "OUTPUT_FORMAT") {
-      tok = read_output_format(ctx, tok.subspan(1));
+      tok = read_output_format(tok.subspan(1));
     } else if (tok[0] == "INPUT" || tok[0] == "GROUP") {
-      tok = read_group(ctx, tok.subspan(1));
+      tok = read_group(tok.subspan(1));
     } else if (tok[0] == "VERSION") {
       tok = tok.subspan(1);
-      tok = skip(ctx, tok, "{");
-      read_version_script(ctx, tok);
-      tok = skip(ctx, tok, "}");
+      tok = skip(tok, "{");
+      tok = read_version_script(tok);
+      tok = skip(tok, "}");
     } else if (tok.size() > 3 && tok[1] == "=" && tok[3] == ";") {
       ctx.arg.defsyms.emplace_back(get_symbol(ctx, unquote(tok[0])),
                                    get_symbol(ctx, unquote(tok[2])));
@@ -250,18 +229,15 @@ void parse_linker_script(Context<E> &ctx, MappedFile *mf) {
     } else if (tok[0] == ";") {
       tok = tok.subspan(1);
     } else {
-      SyntaxError(ctx, tok[0]) << "unknown linker script token";
+      error(tok[0], "unknown linker script token");
     }
   }
 }
 
 template <typename E>
-std::string_view
-get_script_output_type(Context<E> &ctx, MappedFile *mf) {
-  ctx.script_file = mf;
-
-  std::vector<std::string_view> vec = tokenize(ctx, mf->get_contents());
-  std::span<std::string_view> tok = vec;
+std::string_view Script<E>::get_script_output_type() {
+  std::call_once(once, [&] { tokenize(); });
+  std::span<std::string_view> tok = tokens;
 
   if (tok.size() >= 3 && tok[0] == "OUTPUT_FORMAT" && tok[1] == "(") {
     if (tok[2] == "elf64-x86-64")
@@ -272,14 +248,12 @@ get_script_output_type(Context<E> &ctx, MappedFile *mf) {
 
   if (tok.size() >= 3 && (tok[0] == "INPUT" || tok[0] == "GROUP") &&
       tok[1] == "(")
-    if (MappedFile *mf = resolve_path(ctx, tok[2], false))
-      return get_machine_type(ctx, mf);
-
+    if (MappedFile *mf = resolve_path(tok[2], false))
+      return get_machine_type(ctx, rctx, mf);
   return "";
 }
 
-static bool read_label(std::span<std::string_view> &tok,
-                       std::string label) {
+static bool read_label(std::span<std::string_view> &tok, std::string label) {
   if (tok.size() >= 1 && tok[0] == label + ":") {
     tok = tok.subspan(1);
     return true;
@@ -293,10 +267,10 @@ static bool read_label(std::span<std::string_view> &tok,
 }
 
 template <typename E>
-static void
-read_version_script_commands(Context<E> &ctx, std::span<std::string_view> &tok,
-                             std::string_view ver_str, u16 ver_idx,
-                             bool is_global, bool is_cpp) {
+std::span<std::string_view>
+Script<E>::read_version_script_commands(std::span<std::string_view> tok,
+                                     std::string_view ver_str, u16 ver_idx,
+                                     bool is_global, bool is_cpp) {
   while (!tok.empty() && tok[0] != "}") {
     if (read_label(tok, "global")) {
       is_global = true;
@@ -313,39 +287,41 @@ read_version_script_commands(Context<E> &ctx, std::span<std::string_view> &tok,
 
       if (!tok.empty() && tok[0] == "\"C\"") {
         tok = tok.subspan(1);
-        tok = skip(ctx, tok, "{");
-        read_version_script_commands( ctx, tok, ver_str, ver_idx, is_global, false);
+        tok = skip(tok, "{");
+        tok = read_version_script_commands(tok, ver_str, ver_idx, is_global, false);
       } else {
-        tok = skip(ctx, tok, "\"C++\"");
-        tok = skip(ctx, tok, "{");
-        read_version_script_commands(ctx, tok, ver_str, ver_idx, is_global, true);
+        tok = skip(tok, "\"C++\"");
+        tok = skip(tok, "{");
+        tok = read_version_script_commands(tok, ver_str, ver_idx, is_global, true);
       }
 
-      tok = skip(ctx, tok, "}");
-      tok = skip(ctx, tok, ";");
+      tok = skip(tok, "}");
+      tok = skip(tok, ";");
       continue;
     }
 
     if (tok[0] == "*") {
       ctx.default_version = (is_global ? ver_idx : (u32)VER_NDX_LOCAL);
     } else if (is_global) {
-      ctx.version_patterns.push_back({unquote(tok[0]), ctx.script_file->name,
-                                      ver_str, ver_idx, is_cpp});
+      ctx.version_patterns.push_back({unquote(tok[0]), mf->name, ver_str,
+                                      ver_idx, is_cpp});
     } else {
-      ctx.version_patterns.push_back({unquote(tok[0]), ctx.script_file->name,
-                                      ver_str, VER_NDX_LOCAL, is_cpp});
+      ctx.version_patterns.push_back({unquote(tok[0]), mf->name, ver_str,
+                                      VER_NDX_LOCAL, is_cpp});
     }
 
     tok = tok.subspan(1);
 
     if (!tok.empty() && tok[0] == "}")
-      return;
-    tok = skip(ctx, tok, ";");
+      break;
+    tok = skip(tok, ";");
   }
+  return tok;
 }
 
 template <typename E>
-void read_version_script(Context<E> &ctx, std::span<std::string_view> &tok) {
+std::span<std::string_view>
+Script<E>::read_version_script(std::span<std::string_view> tok) {
   u16 next_ver = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size() + 1;
 
   while (!tok.empty() && tok[0] != "}") {
@@ -362,83 +338,87 @@ void read_version_script(Context<E> &ctx, std::span<std::string_view> &tok) {
       tok = tok.subspan(1);
     }
 
-    tok = skip(ctx, tok, "{");
-    read_version_script_commands(ctx, tok, ver_str, ver_idx, true, false);
-    tok = skip(ctx, tok, "}");
+    tok = skip(tok, "{");
+    tok = read_version_script_commands(tok, ver_str, ver_idx, true, false);
+    tok = skip(tok, "}");
     if (!tok.empty() && tok[0] != ";")
       tok = tok.subspan(1);
-    tok = skip(ctx, tok, ";");
+    tok = skip(tok, ";");
   }
+  return tok;
 }
 
 template <typename E>
-void parse_version_script(Context<E> &ctx, MappedFile *mf) {
-  ctx.script_file = mf;
-  std::vector<std::string_view> vec = tokenize(ctx, mf->get_contents());
-  std::span<std::string_view> tok = vec;
-  read_version_script(ctx, tok);
+void Script<E>::parse_version_script() {
+  std::call_once(once, [&] { tokenize(); });
+  std::span<std::string_view> tok = tokens;
+  tok = read_version_script(tok);
   if (!tok.empty())
-    SyntaxError(ctx, tok[0]) << "trailing garbage token";
+    error(tok[0], "trailing garbage token");
 }
 
 template <typename E>
-void read_dynamic_list_commands(Context<E> &ctx,
-                                std::vector<DynamicPattern> &result,
-                                std::span<std::string_view> &tok,
-                                bool is_cpp) {
+std::span<std::string_view>
+Script<E>::read_dynamic_list_commands(std::span<std::string_view> tok,
+                                   std::vector<DynamicPattern> &result,
+                                   bool is_cpp) {
   while (!tok.empty() && tok[0] != "}") {
     if (tok[0] == "extern") {
       tok = tok.subspan(1);
 
       if (!tok.empty() && tok[0] == "\"C\"") {
         tok = tok.subspan(1);
-        tok = skip(ctx, tok, "{");
-        read_dynamic_list_commands(ctx, result, tok, false);
+        tok = skip(tok, "{");
+        tok = read_dynamic_list_commands(tok, result, false);
       } else {
-        tok = skip(ctx, tok, "\"C++\"");
-        tok = skip(ctx, tok, "{");
-        read_dynamic_list_commands(ctx, result, tok, true);
+        tok = skip(tok, "\"C++\"");
+        tok = skip(tok, "{");
+        tok = read_dynamic_list_commands(tok, result, true);
       }
 
-      tok = skip(ctx, tok, "}");
-      tok = skip(ctx, tok, ";");
+      tok = skip(tok, "}");
+      tok = skip(tok, ";");
       continue;
     }
 
     result.push_back({unquote(tok[0]), "", is_cpp});
-    tok = skip(ctx, tok.subspan(1), ";");
+    tok = skip(tok.subspan(1), ";");
   }
+  return tok;
 }
 
 template <typename E>
-std::vector<DynamicPattern>
-parse_dynamic_list(Context<E> &ctx, std::string_view path) {
-  std::string_view contents =
-    must_open_file(ctx, std::string(path))->get_contents();
-  std::vector<std::string_view> vec = tokenize(ctx, contents);
-  std::span<std::string_view> tok = vec;
+std::vector<DynamicPattern> Script<E>::parse_dynamic_list() {
+  std::call_once(once, [&] { tokenize(); });
+  std::span<std::string_view> tok = tokens;
   std::vector<DynamicPattern> result;
 
-  tok = skip(ctx, tok, "{");
-  read_dynamic_list_commands(ctx, result, tok, false);
-  tok = skip(ctx, tok, "}");
-  tok = skip(ctx, tok, ";");
+  tok = skip(tok, "{");
+  tok = read_dynamic_list_commands(tok, result, false);
+  tok = skip(tok, "}");
+  tok = skip(tok, ";");
 
   if (!tok.empty())
-    SyntaxError(ctx, tok[0]) << "trailing garbage token";
+    error(tok[0], "trailing garbage token");
 
   for (DynamicPattern &p : result)
-    p.source = path;
-
+    p.source = mf->name;
   return result;
 }
 
+// Parses the dynamic list file at `path` with a fresh ReaderContext.
+template <typename E> std::vector<DynamicPattern>
+parse_dynamic_list(Context<E> &ctx, std::string_view path) {
+  ReaderContext rctx;
+  Script script(ctx, rctx, must_open_file(ctx, std::string(path)));
+  return script.parse_dynamic_list();
+}
+
 using E = MOLD_TARGET;
 
-template void parse_linker_script(Context<E> &, MappedFile *);
-template std::string_view get_script_output_type(Context<E> &, MappedFile *);
-template void parse_version_script(Context<E> &, MappedFile *);
-template std::vector<DynamicPattern> parse_dynamic_list(Context<E> &, std::string_view);
+template class Script<E>;
 
+template
+std::vector<DynamicPattern> parse_dynamic_list(Context<E> &, std::string_view);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/lto-unix.cc b/src/lto-unix.cc
similarity index 98%
rename from elf/lto-unix.cc
rename to src/lto-unix.cc
index e7c22e71..740842f8 100644
--- a/elf/lto-unix.cc
+++ b/src/lto-unix.cc
@@ -95,7 +95,7 @@
 # define LOG std::ostringstream()
 #endif
 
-namespace mold::elf {
+namespace mold {
 
 // Global variables
 // We store LTO-related information to global variables,
@@ -567,11 +567,7 @@ static ElfSym<E> to_elf_sym(PluginSymbol &psym) {
 // Returns false if it's GCC.
 template <typename E>
 static bool is_llvm(Context<E> &ctx) {
-#ifdef __MINGW32__
-  return ctx.arg.plugin.ends_with("LLVMgold.dll");
-#else
-  return ctx.arg.plugin.ends_with("LLVMgold.so");
-#endif
+  return ctx.arg.plugin.find("LLVMgold.") != ctx.arg.plugin.npos;
 }
 
 // Returns true if a given linker plugin supports the get_symbols_v3 API.
@@ -678,8 +674,8 @@ ObjectFile<E> *read_lto_object(Context<E> &ctx, MappedFile *mf) {
 
 // Entry point
 template <typename E>
-std::vector<ObjectFile<E> *> do_lto(Context<E> &ctx) {
-  Timer t(ctx, "do_lto");
+std::vector<ObjectFile<E> *> run_lto_plugin(Context<E> &ctx) {
+  Timer t(ctx, "run_lto_plugin");
   load_lto_plugin(ctx);
 
   if (!ctx.arg.lto_pass2 && !supports_v3_api(ctx))
@@ -747,7 +743,7 @@ void lto_cleanup(Context<E> &ctx) {
 using E = MOLD_TARGET;
 
 template ObjectFile<E> *read_lto_object(Context<E> &, MappedFile *);
-template std::vector<ObjectFile<E> *> do_lto(Context<E> &);
+template std::vector<ObjectFile<E> *> run_lto_plugin(Context<E> &);
 template void lto_cleanup(Context<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/lto-win32.cc b/src/lto-win32.cc
similarity index 71%
rename from elf/lto-win32.cc
rename to src/lto-win32.cc
index 456e406f..f5d17eec 100644
--- a/elf/lto-win32.cc
+++ b/src/lto-win32.cc
@@ -1,7 +1,7 @@
 #include "mold.h"
 #include "lto.h"
 
-namespace mold::elf {
+namespace mold {
 
 template <typename E>
 ObjectFile<E> *read_lto_object(Context<E> &ctx, MappedFile *mf) {
@@ -9,7 +9,7 @@ ObjectFile<E> *read_lto_object(Context<E> &ctx, MappedFile *mf) {
 }
 
 template <typename E>
-std::vector<ObjectFile<E> *> do_lto(Context<E> &ctx) {
+std::vector<ObjectFile<E> *> run_lto_plugin(Context<E> &ctx) {
   return {};
 }
 
@@ -19,7 +19,7 @@ void lto_cleanup(Context<E> &ctx) {}
 using E = MOLD_TARGET;
 
 template ObjectFile<E> *read_lto_object(Context<E> &, MappedFile *);
-template std::vector<ObjectFile<E> *> do_lto(Context<E> &);
+template std::vector<ObjectFile<E> *> run_lto_plugin(Context<E> &);
 template void lto_cleanup(Context<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/lto.h b/src/lto.h
similarity index 98%
rename from elf/lto.h
rename to src/lto.h
index 5f2225d2..f1795534 100644
--- a/elf/lto.h
+++ b/src/lto.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../common/integers.h"
+#include "../lib/integers.h"
 
 namespace mold {
 
diff --git a/elf/main.cc b/src/main.cc
similarity index 72%
rename from elf/main.cc
rename to src/main.cc
index c3b1d929..ce94043c 100644
--- a/elf/main.cc
+++ b/src/main.cc
@@ -1,6 +1,6 @@
 #include "mold.h"
-#include "../common/archive-file.h"
-#include "../common/output-file.h"
+#include "filetype.h"
+#include "../lib/archive-file.h"
 
 #include <cstring>
 #include <functional>
@@ -23,16 +23,17 @@
 
 #ifdef MOLD_X86_64
 int main(int argc, char **argv) {
-  return mold::elf::elf_main<mold::elf::X86_64>(argc, argv);
+  return mold::mold_main<mold::X86_64>(argc, argv);
 }
 #endif
 
-namespace mold::elf {
+namespace mold {
 
 // Read the beginning of a given file and returns its machine type
 // (e.g. EM_X86_64 or EM_386).
 template <typename E>
-std::string_view get_machine_type(Context<E> &ctx, MappedFile *mf) {
+std::string_view
+get_machine_type(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf) {
   auto get_elf_type = [&](u8 *buf) -> std::string_view {
     bool is_le = (((ElfEhdr<I386> *)buf)->e_ident[EI_DATA] == ELFDATA2LSB);
     bool is_64;
@@ -73,8 +74,6 @@ std::string_view get_machine_type(Context<E> &ctx, MappedFile *mf) {
       return M68K::target_name;
     case EM_SH:
       return SH4::target_name;
-    case EM_ALPHA:
-      return ALPHA::target_name;
     case EM_LOONGARCH:
       return is_64 ? LOONGARCH64::target_name : LOONGARCH32::target_name;
     default:
@@ -100,7 +99,7 @@ std::string_view get_machine_type(Context<E> &ctx, MappedFile *mf) {
         return get_elf_type(child->data);
     return "";
   case FileType::TEXT:
-    return get_script_output_type(ctx, mf);
+    return Script(ctx, rctx, mf).get_script_output_type();
   default:
     return "";
   }
@@ -108,33 +107,33 @@ std::string_view get_machine_type(Context<E> &ctx, MappedFile *mf) {
 
 template <typename E>
 static void
-check_file_compatibility(Context<E> &ctx, MappedFile *mf) {
-  std::string_view target = get_machine_type(ctx, mf);
+check_file_compatibility(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf) {
+  std::string_view target = get_machine_type(ctx, rctx, mf); // must equal ctx.arg.emulation, else fatal
   if (target != ctx.arg.emulation)
     Fatal(ctx) << mf->name << ": incompatible file type: "
                << ctx.arg.emulation << " is expected but got " << target;
 }
 
 template <typename E>
-static ObjectFile<E> *new_object_file(Context<E> &ctx, MappedFile *mf,
-                                      std::string archive_name) {
+static ObjectFile<E> *new_object_file(Context<E> &ctx, ReaderContext &rctx,
+                                      MappedFile *mf, std::string archive_name) {
   static Counter count("parsed_objs");
   count++;
 
-  check_file_compatibility(ctx, mf);
+  check_file_compatibility(ctx, rctx, mf); // fatal if mf's type differs from ctx.arg.emulation
 
-  bool in_lib = ctx.in_lib || (!archive_name.empty() && !ctx.whole_archive);
+  bool in_lib = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive);
   ObjectFile<E> *file = ObjectFile<E>::create(ctx, mf, archive_name, in_lib);
   file->priority = ctx.file_priority++;
-  ctx.tg.run([file, &ctx] { file->parse(ctx); });
+  rctx.tg->run([file, &ctx] { file->parse(ctx); }); // parse as a task on the reader's task group
   if (ctx.arg.trace)
     Out(ctx) << "trace: " << *file;
   return file;
 }
 
 template <typename E>
-static ObjectFile<E> *new_lto_obj(Context<E> &ctx, MappedFile *mf,
-                                  std::string archive_name) {
+static ObjectFile<E> *new_lto_obj(Context<E> &ctx, ReaderContext &rctx,
+                                  MappedFile *mf, std::string archive_name) {
   static Counter count("parsed_lto_objs");
   count++;
 
@@ -144,7 +143,7 @@ static ObjectFile<E> *new_lto_obj(Context<E> &ctx, MappedFile *mf,
   ObjectFile<E> *file = read_lto_object(ctx, mf);
   file->priority = ctx.file_priority++;
   file->archive_name = archive_name;
-  file->is_in_lib = ctx.in_lib || (!archive_name.empty() && !ctx.whole_archive);
+  file->is_in_lib = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive);
   file->is_alive = !file->is_in_lib;
   if (ctx.arg.trace)
     Out(ctx) << "trace: " << *file;
@@ -153,40 +152,37 @@ static ObjectFile<E> *new_lto_obj(Context<E> &ctx, MappedFile *mf,
 
 template <typename E>
 static SharedFile<E> *
-new_shared_file(Context<E> &ctx, MappedFile *mf) {
-  check_file_compatibility(ctx, mf);
+new_shared_file(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf) {
+  check_file_compatibility(ctx, rctx, mf); // fatal if mf's type differs from ctx.arg.emulation
 
   SharedFile<E> *file = SharedFile<E>::create(ctx, mf);
   file->priority = ctx.file_priority++;
-  ctx.tg.run([file, &ctx] { file->parse(ctx); });
+  file->is_alive = !rctx.as_needed; // read under --as-needed: starts out not alive
+  rctx.tg->run([file, &ctx] { file->parse(ctx); }); // parse as a task on the reader's task group
   if (ctx.arg.trace)
     Out(ctx) << "trace: " << *file;
   return file;
 }
 
 template <typename E>
-void read_file(Context<E> &ctx, MappedFile *mf) {
-  if (ctx.visited.contains(mf->name))
-    return;
-
+void read_file(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf) {
   switch (get_file_type(ctx, mf)) {
   case FileType::ELF_OBJ:
-    ctx.objs.push_back(new_object_file(ctx, mf, ""));
+    ctx.objs.push_back(new_object_file(ctx, rctx, mf, ""));
     return;
   case FileType::ELF_DSO:
-    ctx.dsos.push_back(new_shared_file(ctx, mf));
-    ctx.visited.insert(mf->name);
+    ctx.dsos.push_back(new_shared_file(ctx, rctx, mf));
     return;
   case FileType::AR:
   case FileType::THIN_AR:
     for (MappedFile *child : read_archive_members(ctx, mf)) {
       switch (get_file_type(ctx, child)) {
       case FileType::ELF_OBJ:
-        ctx.objs.push_back(new_object_file(ctx, child, mf->name));
+        ctx.objs.push_back(new_object_file(ctx, rctx, child, mf->name));
         break;
       case FileType::GCC_LTO_OBJ:
       case FileType::LLVM_BITCODE:
-        if (ObjectFile<E> *file = new_lto_obj(ctx, child, mf->name))
+        if (ObjectFile<E> *file = new_lto_obj(ctx, rctx, child, mf->name))
           ctx.objs.push_back(file);
         break;
       case FileType::ELF_DSO:
@@ -197,15 +193,13 @@ void read_file(Context<E> &ctx, MappedFile *mf) {
         break;
       }
     }
-    if (!ctx.whole_archive)
-      ctx.visited.insert(mf->name);
     return;
   case FileType::TEXT:
-    parse_linker_script(ctx, mf);
+    Script(ctx, rctx, mf).parse_linker_script();
     return;
   case FileType::GCC_LTO_OBJ:
   case FileType::LLVM_BITCODE:
-    if (ObjectFile<E> *file = new_lto_obj(ctx, mf, ""))
+    if (ObjectFile<E> *file = new_lto_obj(ctx, rctx, mf, ""))
       ctx.objs.push_back(file);
     return;
   default:
@@ -215,33 +209,46 @@ void read_file(Context<E> &ctx, MappedFile *mf) {
 
 template <typename E>
 static std::string_view
-detect_machine_type(Context<E> &ctx, std::vector<std::string> paths) {
-  std::erase(paths, "-");
-
-  for (const std::string &path : paths)
-    if (auto *mf = open_file(ctx, path))
-      if (get_file_type(ctx, mf) != FileType::TEXT)
-        if (std::string_view target = get_machine_type(ctx, mf);
-            !target.empty())
-          return target;
-
-  for (const std::string &path : paths)
-    if (auto *mf = open_file(ctx, path))
-      if (get_file_type(ctx, mf) == FileType::TEXT)
-        if (std::string_view target = get_script_output_type(ctx, mf);
-            !target.empty())
-          return target;
+detect_machine_type(Context<E> &ctx, std::vector<std::string> args) {
+  for (ReaderContext rctx; const std::string &arg : args) { // pass 1: non-text files only
+    if (arg == "--Bstatic") {
+      rctx.static_ = true;
+    } else if (arg == "--Bdynamic") {
+      rctx.static_ = false;
+    } else if (!arg.starts_with('-')) { // anything starting with '-' (incl. -l<name>) is skipped
+      if (MappedFile *mf = open_file(ctx, arg))
+        if (get_file_type(ctx, mf) != FileType::TEXT)
+          if (std::string_view target = get_machine_type(ctx, rctx, mf);
+              !target.empty())
+            return target; // first non-empty answer wins
+    }
+  }
+
+  for (ReaderContext rctx; const std::string &arg : args) { // pass 2: text files only, fresh rctx
+    if (arg == "--Bstatic") {
+      rctx.static_ = true;
+    } else if (arg == "--Bdynamic") {
+      rctx.static_ = false;
+    } else if (!arg.starts_with('-')) {
+      if (MappedFile *mf = open_file(ctx, arg))
+        if (get_file_type(ctx, mf) == FileType::TEXT)
+          if (std::string_view target =
+              Script(ctx, rctx, mf).get_script_output_type();
+              !target.empty())
+            return target;
+    }
+  }
 
   Fatal(ctx) << "-m option is missing";
 }
 
 template <typename E>
-MappedFile *open_library(Context<E> &ctx, std::string path) {
+MappedFile *open_library(Context<E> &ctx, ReaderContext &rctx, std::string path) {
   MappedFile *mf = open_file(ctx, path);
   if (!mf)
     return nullptr;
 
-  std::string_view target = get_machine_type(ctx, mf);
+  std::string_view target = get_machine_type(ctx, rctx, mf);
   if (!target.empty() && target != E::target_name) {
     Warn(ctx) << path << ": skipping incompatible file: " << target
               << " (e_machine " << (int)E::e_machine << ")";
@@ -251,11 +258,11 @@ MappedFile *open_library(Context<E> &ctx, std::string path) {
 }
 
 template <typename E>
-MappedFile *find_library(Context<E> &ctx, std::string name) {
+MappedFile *find_library(Context<E> &ctx, ReaderContext &rctx, std::string name) {
   if (name.starts_with(':')) {
     for (std::string_view dir : ctx.arg.library_paths) {
       std::string path = std::string(dir) + "/" + name.substr(1);
-      if (MappedFile *mf = open_library(ctx, path))
+      if (MappedFile *mf = open_library(ctx, rctx, path))
         return mf;
     }
     Fatal(ctx) << "library not found: " << name;
@@ -263,94 +270,88 @@ MappedFile *find_library(Context<E> &ctx, std::string name) {
 
   for (std::string_view dir : ctx.arg.library_paths) {
     std::string stem = std::string(dir) + "/lib" + name;
-    if (!ctx.is_static)
-      if (MappedFile *mf = open_library(ctx, stem + ".so"))
+    if (!rctx.static_)
+      if (MappedFile *mf = open_library(ctx, rctx, stem + ".so"))
         return mf;
-    if (MappedFile *mf = open_library(ctx, stem + ".a"))
+    if (MappedFile *mf = open_library(ctx, rctx, stem + ".a"))
       return mf;
   }
   Fatal(ctx) << "library not found: " << name;
 }
 
-template <typename E>
-MappedFile *find_from_search_paths(Context<E> &ctx, std::string name) {
-  if (MappedFile *mf = open_file(ctx, name))
-    return mf;
-
-  for (std::string_view dir : ctx.arg.library_paths)
-    if (MappedFile *mf =
-        open_file(ctx, std::string(dir) + "/" + name))
-      return mf;
-  return nullptr;
-}
-
 template <typename E>
 static void read_input_files(Context<E> &ctx, std::span<std::string> args) {
   Timer t(ctx, "read_input_files");
 
-  std::vector<std::tuple<bool, bool, bool, bool>> state;
-  ctx.is_static = ctx.arg.is_static;
+  ReaderContext rctx; // NOTE(review): static_ is no longer seeded from ctx.arg.is_static — confirm
+  std::vector<ReaderContext> stack; // states saved by --push-state
+  std::unordered_set<std::string_view> visited; // -l names already handled
+
+  tbb::task_group tg;
+  rctx.tg = &tg; // the new_*_file helpers spawn their parse tasks through rctx.tg
 
   while (!args.empty()) {
     std::string_view arg = args[0];
     args = args.subspan(1);
 
     if (arg == "--as-needed") {
-      ctx.as_needed = true;
+      rctx.as_needed = true;
     } else if (arg == "--no-as-needed") {
-      ctx.as_needed = false;
+      rctx.as_needed = false;
     } else if (arg == "--whole-archive") {
-      ctx.whole_archive = true;
+      rctx.whole_archive = true;
     } else if (arg == "--no-whole-archive") {
-      ctx.whole_archive = false;
+      rctx.whole_archive = false;
     } else if (arg == "--Bstatic") {
-      ctx.is_static = true;
+      rctx.static_ = true;
     } else if (arg == "--Bdynamic") {
-      ctx.is_static = false;
+      rctx.static_ = false;
     } else if (arg == "--start-lib") {
-      ctx.in_lib = true;
+      rctx.in_lib = true;
     } else if (arg == "--end-lib") {
-      ctx.in_lib = false;
-    } else if (remove_prefix(arg, "--version-script=")) {
-      MappedFile *mf = find_from_search_paths(ctx, std::string(arg));
-      if (!mf)
-        Fatal(ctx) << "--version-script: file not found: " << arg;
-      parse_version_script(ctx, mf);
+      rctx.in_lib = false;
     } else if (arg == "--push-state") {
-      state.push_back({ctx.as_needed, ctx.whole_archive, ctx.is_static,
-                       ctx.in_lib});
+      stack.push_back(rctx); // saves the whole ReaderContext, tg pointer included
     } else if (arg == "--pop-state") {
-      if (state.empty())
+      if (stack.empty())
         Fatal(ctx) << "no state pushed before popping";
-      std::tie(ctx.as_needed, ctx.whole_archive, ctx.is_static, ctx.in_lib) =
-        state.back();
-      state.pop_back();
-    } else if (remove_prefix(arg, "-l")) {
-      MappedFile *mf = find_library(ctx, std::string(arg));
+      rctx = stack.back();
+      stack.pop_back();
+    } else if (arg.starts_with("-l")) {
+      arg = arg.substr(2); // drop the "-l" prefix, leaving the library name
+      if (visited.contains(arg)) // keyed by name only; the current rctx flags are not considered
+        continue;
+      visited.insert(arg);
+
+      MappedFile *mf = find_library(ctx, rctx, std::string(arg));
       mf->given_fullpath = false;
-      read_file(ctx, mf);
+      read_file(ctx, rctx, mf);
     } else {
-      read_file(ctx, must_open_file(ctx, std::string(arg)));
+      read_file(ctx, rctx, must_open_file(ctx, std::string(arg))); // any other argument is a file path
     }
   }
 
   if (ctx.objs.empty())
     Fatal(ctx) << "no input files";
 
-  ctx.tg.wait();
+  tg.wait(); // block until every parse task spawned on tg has finished
+}
+
+template <typename E>
+static bool has_lto_obj(Context<E> &ctx) { // true if any live object is an LTO or GCC offload object
+  for (ObjectFile<E> *file : ctx.objs)
+    if (file->is_alive && (file->is_lto_obj || file->is_gcc_offload_obj))
+      return true;
+  return false;
 }
 
 template <typename E>
-int elf_main(int argc, char **argv) {
+int mold_main(int argc, char **argv) {
   Context<E> ctx;
 
   // Process -run option first. process_run_subcommand() does not return.
-  if (argc >= 2 && (argv[1] == "-run"sv || argv[1] == "--run"sv)) {
-#if defined(_WIN32) || defined(__APPLE__)
-    Fatal(ctx) << "-run is supported only on Unix";
-#endif
+  if (argc >= 2 && (argv[1] == "-run"sv || argv[1] == "--run"sv))
     process_run_subcommand(ctx, argc, argv);
-  }
 
   // Parse non-positional command line options
   ctx.cmdline_args = expand_response_files(ctx, argv);
@@ -375,12 +376,8 @@ int elf_main(int argc, char **argv) {
                  << ": " << errno_string();
 
   // Fork a subprocess unless --no-fork is given.
-  std::function<void()> on_complete;
-
-#if !defined(_WIN32) && !defined(__APPLE__)
   if (ctx.arg.fork)
-    on_complete = fork_child();
-#endif
+    fork_child();
 
   acquire_global_lock();
 
@@ -393,8 +390,8 @@ int elf_main(int argc, char **argv) {
 
   // Handle --retain-symbols-file options if any.
   if (ctx.arg.retain_symbols_file)
-    for (std::string_view name : *ctx.arg.retain_symbols_file)
-      get_symbol(ctx, name)->write_to_symtab = true;
+    for (Symbol<E> *sym : *ctx.arg.retain_symbols_file)
+      sym->write_to_symtab = true;
 
   for (std::string_view arg : ctx.arg.trace_symbol)
     get_symbol(ctx, arg)->is_traced = true;
@@ -419,26 +416,26 @@ int elf_main(int argc, char **argv) {
   if (!ctx.arg.relocatable)
     create_internal_file(ctx);
 
-  // resolve_symbols is 4 things in 1 phase:
-  //
-  // - Determine the set of object files to extract from archives.
-  // - Remove redundant COMDAT sections (e.g. duplicate inline functions).
-  // - Finally, the actual symbol resolution.
-  // - LTO, which requires preliminary symbol resolution before running
-  //   and a follow-up re-resolution after the LTO objects are emitted.
-  //
-  // These passes have complex interactions, and unfortunately has to be
-  // put together in a single phase.
+  // Resolve symbols by choosing the most appropriate file for each
+  // symbol. This pass also removes redundant comdat sections (e.g.
+  // duplicate inline functions).
   resolve_symbols(ctx);
 
-  // "Kill" .eh_frame input sections after symbol resolution.
-  kill_eh_frame_sections(ctx);
+  // If there's an object file compiled with -flto, do link-time
+  // optimization.
+  if (has_lto_obj(ctx))
+    do_lto(ctx);
 
-  // Split mergeable section contents into section pieces.
-  split_section_pieces(ctx);
+  // Now that we know which object files are to be included in the
+  // final output, we can remove unnecessary files.
+  std::erase_if(ctx.objs, [](InputFile<E> *file) { return !file->is_alive; });
+  std::erase_if(ctx.dsos, [](InputFile<E> *file) { return !file->is_alive; });
 
-  // Resolve mergeable section pieces to merge them.
-  resolve_section_pieces(ctx);
+  // Parse .eh_frame section contents.
+  parse_eh_frame_sections(ctx);
+
+  // Split mergeable section contents into section pieces.
+  create_merged_sections(ctx);
 
   // Handle --relocatable. Since the linker's behavior is quite different
   // from the normal one when the option is given, the logic is implemented
@@ -472,9 +469,6 @@ int elf_main(int argc, char **argv) {
   if (ctx.arg.icf)
     icf_sections(ctx);
 
-  // Compute sizes of sections containing mergeable strings.
-  compute_merged_section_sizes(ctx);
-
   // Create linker-synthesized sections such as .got or .plt.
   create_synthetic_sections(ctx);
 
@@ -482,6 +476,9 @@ int elf_main(int argc, char **argv) {
   if (!ctx.arg.allow_multiple_definition)
     check_duplicate_symbols(ctx);
 
+  if (!ctx.arg.allow_shlib_undefined)
+    check_shlib_undefined(ctx);
+
   // Warn if symbols with different types are defined under the same name.
   check_symbol_types(ctx);
 
@@ -491,6 +488,10 @@ int elf_main(int argc, char **argv) {
   // Bin input sections into output sections.
   create_output_sections(ctx);
 
+  // Convert an .ARM.exidx to a synthetic section.
+  if constexpr (is_arm32<E>)
+    create_arm_exidx_section(ctx);
+
   // Handle --section-align options.
   if (!ctx.arg.section_align.empty())
     apply_section_align(ctx);
@@ -573,14 +574,17 @@ int elf_main(int argc, char **argv) {
   // Compute the is_weak bit for each imported symbol.
   compute_imported_symbol_weakness(ctx);
 
-  // Compute sizes of output sections while assigning offsets
-  // within an output section to input sections.
-  compute_section_sizes(ctx);
-
   // Sort sections by section attributes so that we'll have to
   // create as few segments as possible.
   sort_output_sections(ctx);
 
+  if (!ctx.arg.separate_debug_file.empty())
+    separate_debug_sections(ctx);
+
+  // Compute sizes of output sections while assigning offsets
+  // within an output section to input sections.
+  compute_section_sizes(ctx);
+
   // If --packed_dyn_relocs=relr was given, base relocations are stored
   // to a .relr.dyn section in a compressed form. Construct a compressed
   // relocations now so that we can fix section sizes and file layout.
@@ -590,7 +594,7 @@ int elf_main(int argc, char **argv) {
   // Reserve a space for dynamic symbol strings in .dynstr and sort
   // .dynsym contents if necessary. Beyond this point, no symbol will
   // be added to .dynsym.
-  ctx.dynsym->finalize(ctx);
+  sort_dynsyms(ctx);
 
   // Print reports about undefined symbols, if needed.
   if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR)
@@ -604,7 +608,8 @@ int elf_main(int argc, char **argv) {
   ctx.verneed->construct(ctx);
 
   // Compute .symtab and .strtab sizes for each file.
-  create_output_symtab(ctx);
+  if (!ctx.arg.strip_all)
+    create_output_symtab(ctx);
 
   // .eh_frame is a special section from the linker's point of view,
   // as its contents are parsed and reconstructed by the linker,
@@ -627,8 +632,17 @@ int elf_main(int argc, char **argv) {
   // that they can jump to anywhere in ±2 GiB by default. They may
   // be replaced with shorter instruction sequences if destinations
   // are close enough. Do this optimization.
-  if constexpr (is_riscv<E>)
-    filesize = riscv_resize_sections(ctx);
+  if constexpr (is_riscv<E> || is_loongarch<E>) {
+    shrink_sections(ctx);
+    filesize = set_osec_offsets(ctx);
+  }
+
+  if constexpr (is_arm32<E>) {
+    if (ctx.extra.exidx) {
+      ctx.extra.exidx->remove_duplicate_entries(ctx);
+      filesize = set_osec_offsets(ctx);
+    }
+  }
 
   // At this point, memory layout is fixed.
 
@@ -640,16 +654,17 @@ int elf_main(int argc, char **argv) {
 
   // If --compress-debug-sections is given, compress .debug_* sections
   // using zlib.
-  if (ctx.arg.compress_debug_sections != COMPRESS_NONE)
-    filesize = compress_debug_sections(ctx);
+  if (ctx.arg.compress_debug_sections != COMPRESS_NONE) {
+    compress_debug_sections(ctx);
+    filesize = set_osec_offsets(ctx);
+  }
 
   // At this point, both memory and file layouts are fixed.
 
   t_before_copy.stop();
 
   // Create an output file
-  ctx.output_file =
-    OutputFile<Context<E>>::open(ctx, ctx.arg.output, filesize, 0777);
+  ctx.output_file = OutputFile<E>::open(ctx, ctx.arg.output, filesize, 0777);
   ctx.buf = ctx.output_file->buf;
 
   Timer t_copy(ctx, "copy");
@@ -657,27 +672,28 @@ int elf_main(int argc, char **argv) {
   // Copy input sections to the output file and apply relocations.
   copy_chunks(ctx);
 
-  if (ctx.arg.z_rewrite_endbr)
-    rewrite_endbr(ctx);
+  if constexpr (is_x86_64<E>)
+    if (ctx.arg.z_rewrite_endbr)
+      rewrite_endbr(ctx);
 
   // Dynamic linker works better with sorted .rela.dyn section,
   // so we sort them.
   ctx.reldyn->sort(ctx);
 
-  // Zero-clear paddings between sections
-  clear_padding(ctx);
+  // .note.gnu.build-id section contains a cryptographic hash of the
+  // entire output file. Now that we wrote everything except build-id,
+  // we can compute it.
+  if (ctx.buildid)
+    write_build_id(ctx);
 
   // .gdb_index's contents cannot be constructed before applying
   // relocations to other debug sections. We have relocated debug
   // sections now, so write the .gdb_index section.
-  if (ctx.gdb_index)
+  if (ctx.gdb_index && ctx.arg.separate_debug_file.empty())
     write_gdb_index(ctx);
 
-  // .note.gnu.build-id section contains a cryptographic hash of the
-  // entire output file. Now that we wrote everything except build-id,
-  // we can compute it.
-  if (ctx.buildid)
-    ctx.buildid->write_buildid(ctx);
+  if (!ctx.arg.separate_debug_file.empty())
+    write_gnu_debuglink(ctx);
 
   t_copy.stop();
   ctx.checkpoint();
@@ -697,6 +713,9 @@ int elf_main(int argc, char **argv) {
   if (ctx.arg.print_map)
     print_map(ctx);
 
+  if (!ctx.arg.separate_debug_file.empty())
+    write_separate_debug_file(ctx);
+
   // Show stats numbers
   if (ctx.arg.stats)
     show_stats(ctx);
@@ -707,9 +726,7 @@ int elf_main(int argc, char **argv) {
   std::cout << std::flush;
   std::cerr << std::flush;
 
-  if (on_complete)
-    on_complete();
-
+  notify_parent();
   release_global_lock();
 
   if (ctx.arg.quick_exit)
@@ -723,6 +740,6 @@ int elf_main(int argc, char **argv) {
 
 using E = MOLD_TARGET;
 
-template int elf_main<E>(int, char **);
+template int mold_main<E>(int, char **);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/mapfile.cc b/src/mapfile.cc
similarity index 98%
rename from elf/mapfile.cc
rename to src/mapfile.cc
index 4e730dd9..8d60971b 100644
--- a/elf/mapfile.cc
+++ b/src/mapfile.cc
@@ -7,7 +7,7 @@
 #include <tbb/parallel_for_each.h>
 #include <unordered_map>
 
-namespace mold::elf {
+namespace mold {
 
 template <typename E>
 using Map =
@@ -114,4 +114,4 @@ using E = MOLD_TARGET;
 
 template void print_map(Context<E> &ctx);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/mold-wrapper.c b/src/mold-wrapper.c
similarity index 98%
rename from elf/mold-wrapper.c
rename to src/mold-wrapper.c
index 6dadf811..d63d4de1 100644
--- a/elf/mold-wrapper.c
+++ b/src/mold-wrapper.c
@@ -9,7 +9,7 @@
 #include <string.h>
 #include <unistd.h>
 
-#if !defined(__OpenBSD__) && !defined(__FreeBSD__)
+#if __has_include(<alloca.h>)
 # include <alloca.h>
 #endif
 
diff --git a/elf/mold.h b/src/mold.h
similarity index 88%
rename from elf/mold.h
rename to src/mold.h
index 79946c60..322a0ea8 100644
--- a/elf/mold.h
+++ b/src/mold.h
@@ -1,7 +1,7 @@
 #pragma once
 
+#include "../lib/common.h"
 #include "elf.h"
-#include "../common/common.h"
 
 #include <atomic>
 #include <bitset>
@@ -34,7 +34,7 @@
 # include <unistd.h>
 #endif
 
-namespace mold::elf {
+namespace mold {
 
 template <typename E> class InputFile;
 template <typename E> class InputSection;
@@ -47,6 +47,7 @@ template <typename E> class Symbol;
 template <typename E> struct CieRecord;
 template <typename E> struct Context;
 template <typename E> struct FdeRecord;
+template <typename E> class MergeableSection;
 template <typename E> class RelocSection;
 
 template <typename E>
@@ -59,7 +60,7 @@ std::string get_mold_version();
 //
 
 template <typename E>
-struct SectionFragment {
+struct __attribute__((aligned(4))) SectionFragment {
   SectionFragment(MergedSection<E> *sec, bool is_alive)
     : output_section(*sec), is_alive(is_alive) {}
 
@@ -236,14 +237,14 @@ struct InputSectionExtras<E> {
   std::vector<ThunkRef> thunk_refs;
 };
 
-template <is_riscv E>
+template <typename E> requires is_riscv<E> || is_loongarch<E>
 struct InputSectionExtras<E> {
   std::vector<i32> r_deltas;
 };
 
 // InputSection represents a section in an input object file.
 template <typename E>
-class InputSection {
+class __attribute__((aligned(4))) InputSection {
 public:
   InputSection(Context<E> &ctx, ObjectFile<E> &file, i64 shndx);
 
@@ -263,7 +264,7 @@ class InputSection {
   std::span<FdeRecord<E>> get_fdes() const;
   std::string_view get_func_name(Context<E> &ctx, i64 offset) const;
   bool is_relr_reloc(Context<E> &ctx, const ElfRel<E> &rel) const;
-  bool is_killed_by_icf() const;
+  bool icf_removed() const;
   bool record_undef_error(Context<E> &ctx, const ElfRel<E> &rel);
 
   std::pair<SectionFragment<E> *, i64>
@@ -312,8 +313,6 @@ class InputSection {
 private:
   void scan_pcrel(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel);
   void scan_absrel(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel);
-  void scan_dyn_absrel(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel);
-  void scan_toc_rel(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel);
   void scan_tlsdesc(Context<E> &ctx, Symbol<E> &sym);
   void check_tlsle(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel);
 
@@ -323,8 +322,6 @@ class InputSection {
   void apply_toc_rel(Context<E> &ctx, Symbol<E> &sym, const ElfRel<E> &rel,
                      u8 *loc, u64 S, i64 A, u64 P, ElfRel<E> **dynrel);
 
-  void copy_contents_riscv(Context<E> &ctx, u8 *buf);
-
   u64 get_thunk_addr(i64 idx);
 
   std::optional<u64> get_tombstone(Symbol<E> &sym, SectionFragment<E> *frag);
@@ -334,19 +331,18 @@ class InputSection {
 // tls.cc
 //
 
-template <typename E> u64 get_tls_begin(Context<E> &);
-template <typename E> u64 get_tp_addr(Context<E> &);
-template <typename E> u64 get_dtp_addr(Context<E> &);
+template <typename E> u64 get_tp_addr(const ElfPhdr<E> &);
+template <typename E> u64 get_dtp_addr(const ElfPhdr<E> &);
 
 //
 // output-chunks.cc
 //
 
 template <typename E>
-OutputSection<E> *find_section(Context<E> &ctx, u32 sh_type);
+Chunk<E> *find_chunk(Context<E> &ctx, u32 sh_type);
 
 template <typename E>
-OutputSection<E> *find_section(Context<E> &ctx, std::string_view name);
+Chunk<E> *find_chunk(Context<E> &ctx, std::string_view name);
 
 template <typename E>
 u64 get_eflags(Context<E> &ctx) {
@@ -367,15 +363,16 @@ void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym);
 
 // Chunk represents a contiguous region in an output file.
 template <typename E>
-class Chunk {
+class __attribute__((aligned(4))) Chunk {
 public:
   virtual ~Chunk() = default;
   virtual bool is_header() { return false; }
   virtual OutputSection<E> *to_osec() { return nullptr; }
+  virtual void compute_section_size(Context<E> &ctx) {}
   virtual i64 get_reldyn_size(Context<E> &ctx) const { return 0; }
   virtual void construct_relr(Context<E> &ctx) {}
   virtual void copy_buf(Context<E> &ctx) {}
-  virtual void write_to(Context<E> &ctx, u8 *buf) { unreachable(); }
+  virtual void write_to(Context<E> &ctx, u8 *buf, ElfRel<E> *rel) { unreachable(); }
   virtual void update_shdr(Context<E> &ctx) {}
 
   std::string_view name;
@@ -468,6 +465,24 @@ class InterpSection : public Chunk<E> {
   void copy_buf(Context<E> &ctx) override;
 };
 
+enum AbsRelKind { // classification stored in AbsRel::kind
+  ABS_REL_NONE, // default value of AbsRel::kind
+  ABS_REL_BASEREL,
+  ABS_REL_RELR,
+  ABS_REL_IFUNC,
+  ABS_REL_DYNREL,
+};
+
+// Represents a word-size absolute relocation (e.g. R_X86_64_64).
+template <typename E>
+struct AbsRel {
+  InputSection<E> *isec = nullptr; // presumably the section containing the relocation — TODO confirm
+  u64 offset = 0; // presumably relative to isec — TODO confirm
+  Symbol<E> *sym = nullptr;
+  i64 addend = 0;
+  AbsRelKind kind = ABS_REL_NONE; // ABS_REL_NONE until assigned another kind
+};
+
 // Sections
 template <typename E>
 class OutputSection : public Chunk<E> {
@@ -478,18 +493,22 @@ class OutputSection : public Chunk<E> {
   }
 
   OutputSection<E> *to_osec() override { return this; }
+  void compute_section_size(Context<E> &ctx) override;
+  i64 get_reldyn_size(Context<E> &ctx) const override;
   void construct_relr(Context<E> &ctx) override;
   void copy_buf(Context<E> &ctx) override;
-  void write_to(Context<E> &ctx, u8 *buf) override;
+  void write_to(Context<E> &ctx, u8 *buf, ElfRel<E> *rel) override;
 
   void compute_symtab_size(Context<E> &ctx) override;
   void populate_symtab(Context<E> &ctx) override;
 
+  void scan_abs_relocations(Context<E> &ctx);
   void create_range_extension_thunks(Context<E> &ctx);
 
   std::vector<InputSection<E> *> members;
   std::vector<std::unique_ptr<Thunk<E>>> thunks;
   std::unique_ptr<RelocSection<E>> reloc_sec;
+  std::vector<AbsRel<E>> abs_rels;
   Atomic<u32> sh_flags;
 };
 
@@ -751,12 +770,10 @@ class DynsymSection : public Chunk<E> {
   }
 
   void add_symbol(Context<E> &ctx, Symbol<E> *sym);
-  void finalize(Context<E> &ctx);
   void update_shdr(Context<E> &ctx) override;
   void copy_buf(Context<E> &ctx) override;
 
   std::vector<Symbol<E> *> symbols;
-  bool finalized = false;
 };
 
 template <typename E>
@@ -793,25 +810,30 @@ class GnuHashSection : public Chunk<E> {
 
   i64 num_buckets = -1;
   i64 num_bloom = 1;
+  i64 num_exported = -1;
 };
 
 template <typename E>
 class MergedSection : public Chunk<E> {
 public:
   static MergedSection<E> *
-  get_instance(Context<E> &ctx, std::string_view name, i64 type, i64 flags,
-               i64 entsize, i64 addralign);
+  get_instance(Context<E> &ctx, std::string_view name, const ElfShdr<E> &shdr);
 
   SectionFragment<E> *insert(Context<E> &ctx, std::string_view data,
                              u64 hash, i64 p2align);
 
-  void assign_offsets(Context<E> &ctx);
+  void resolve(Context<E> &ctx);
+  void compute_section_size(Context<E> &ctx) override;
   void copy_buf(Context<E> &ctx) override;
-  void write_to(Context<E> &ctx, u8 *buf) override;
+  void write_to(Context<E> &ctx, u8 *buf, ElfRel<E> *rel) override;
   void print_stats(Context<E> &ctx);
 
+  std::vector<MergeableSection<E> *> members;
+  std::mutex mu;
+
   ConcurrentMap<SectionFragment<E>> map;
   HyperLogLog estimator;
+  bool resolved = false;
 
 private:
   MergedSection(std::string_view name, i64 flags, i64 type, i64 entsize);
@@ -949,9 +971,8 @@ class BuildIdSection : public Chunk<E> {
 
   void update_shdr(Context<E> &ctx) override;
   void copy_buf(Context<E> &ctx) override;
-  void write_buildid(Context<E> &ctx);
 
-  static constexpr i64 HEADER_SIZE = 16;
+  std::vector<u8> contents;
 };
 
 template <typename E>
@@ -987,6 +1008,22 @@ class NotePropertySection : public Chunk<E> {
   std::map<u32, u32> properties;
 };
 
+template <typename E>
+class GnuDebuglinkSection : public Chunk<E> { // chunk for the ".gnu_debuglink" output section
+public:
+  GnuDebuglinkSection() {
+    this->name = ".gnu_debuglink";
+    this->shdr.sh_type = SHT_PROGBITS;
+    this->shdr.sh_addralign = 4; // section is 4-byte aligned
+  }
+
+  void update_shdr(Context<E> &ctx) override;
+  void copy_buf(Context<E> &ctx) override;
+
+  std::string filename; // presumably the separate debug file's name — TODO confirm
+  u32 crc32 = 0; // presumably a CRC32 of that file's contents — TODO confirm
+};
+
 template <typename E>
 class GdbIndexSection : public Chunk<E> {
 public:
@@ -1056,6 +1093,82 @@ class ComdatGroupSection : public Chunk<E> {
   std::vector<Chunk<E> *> members;
 };
 
+//
+// output-file.cc
+//
+
+template <typename E>
+class OutputFile { // abstract base for output files; subclasses must implement close()
+public:
+  static std::unique_ptr<OutputFile<E>>
+  open(Context<E> &ctx, std::string path, i64 filesize, int perm);
+
+  virtual void close(Context<E> &ctx) = 0;
+  virtual ~OutputFile() = default;
+
+  u8 *buf = nullptr; // output buffer; set by the subclass (see MallocOutputFile)
+  std::vector<u8> buf2; // extra bytes; MallocOutputFile::close() writes them after buf
+  std::string path;
+  int fd = -1;
+  i64 filesize = 0; // number of bytes of buf that close() writes
+  bool is_mmapped = false; // set from the constructor argument
+  bool is_unmapped = false;
+
+protected:
+  OutputFile(std::string path, i64 filesize, bool is_mmapped) // only subclasses can construct
+    : path(path), filesize(filesize), is_mmapped(is_mmapped) {}
+};
+
+template <typename E>
+class MallocOutputFile : public OutputFile<E> { // holds the output in a heap buffer, written out by close()
+public:
+  MallocOutputFile(Context<E> &ctx, std::string path, i64 filesize, int perm)
+    : OutputFile<E>(path, filesize, false), ptr(new u8[filesize]),
+      perm(perm) {
+    this->buf = ptr.get(); // buf points into the array owned by ptr
+  }
+
+  void close(Context<E> &ctx) override {
+    Timer t(ctx, "close_file");
+    FILE *fp;
+
+    if (this->path == "-") { // "-" selects standard output
+      fp = stdout;
+    } else {
+#ifdef _WIN32
+      int pmode = (perm & 0200) ? (_S_IREAD | _S_IWRITE) : _S_IREAD; // writable only if perm has owner-write
+      i64 fd = _open(this->path.c_str(), _O_RDWR | _O_CREAT | _O_BINARY, pmode);
+#else
+      i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm); // local; the inherited fd member is not set
+#endif
+      if (fd == -1)
+        Fatal(ctx) << "cannot open " << this->path << ": " << errno_string();
+#ifdef _WIN32
+      fp = _fdopen(fd, "wb");
+#else
+      fp = fdopen(fd, "w"); // NOTE(review): result is not checked for NULL
+#endif
+    }
+
+    fwrite(this->buf, this->filesize, 1, fp); // NOTE(review): fwrite results are not checked
+    if (!this->buf2.empty())
+      fwrite(this->buf2.data(), this->buf2.size(), 1, fp); // buf2 follows the main buffer
+    fclose(fp); // also closes stdout when path is "-"
+  }
+
+private:
+  std::unique_ptr<u8[]> ptr; // owns the filesize-byte buffer that buf points to
+  int perm; // permission bits passed to open() in close()
+};
+
+template <typename E>
+class LockingOutputFile : public OutputFile<E> { // members are only declared here
+public:
+  LockingOutputFile(Context<E> &ctx, std::string path, int perm); // takes no filesize, unlike MallocOutputFile
+  void resize(Context<E> &ctx, i64 filesize); // not part of the OutputFile base interface
+  void close(Context<E> &ctx) override;
+};
+
 //
 // gdb-index.cc
 //
@@ -1090,15 +1203,23 @@ struct ComdatGroupRef {
 };
 
 template <typename E>
-struct MergeableSection {
+class MergeableSection {
+public:
+  MergeableSection(Context<E> &ctx, MergedSection<E> &parent,
+                   std::unique_ptr<InputSection<E>> &isec);
+
+  void split_contents(Context<E> &ctx);
+  void resolve_contents(Context<E> &ctx);
   std::pair<SectionFragment<E> *, i64> get_fragment(i64 offset);
   std::string_view get_contents(i64 idx);
 
-  MergedSection<E> *parent;
-  std::string_view contents;
+  MergedSection<E> &parent;
+  std::vector<SectionFragment<E> *> fragments;
+
+private:
+  std::unique_ptr<InputSection<E>> section;
   std::vector<u32> frag_offsets;
   std::vector<u32> hashes;
-  std::vector<SectionFragment<E> *> fragments;
   u8 p2align = 0;
 };
 
@@ -1191,8 +1312,9 @@ class ObjectFile : public InputFile<E> {
 
   void parse(Context<E> &ctx);
   void initialize_symbols(Context<E> &ctx);
-  void initialize_mergeable_sections(Context<E> &ctx);
-  void resolve_section_pieces(Context<E> &ctx);
+  void parse_ehframe(Context<E> &ctx);
+  void convert_mergeable_sections(Context<E> &ctx);
+  void reattach_section_pieces(Context<E> &ctx);
   void resolve_symbols(Context<E> &ctx) override;
   void mark_live_objects(Context<E> &ctx,
                          std::function<void(InputFile<E> *)> feeder) override;
@@ -1222,9 +1344,6 @@ class ObjectFile : public InputFile<E> {
   bool is_gcc_offload_obj = false;
   bool is_rust_obj = false;
 
-  i64 num_dynrel = 0;
-  i64 reldyn_offset = 0;
-
   i64 fde_idx = 0;
   i64 fde_offset = 0;
   i64 fde_size = 0;
@@ -1251,7 +1370,6 @@ class ObjectFile : public InputFile<E> {
   void sort_relocations(Context<E> &ctx);
   void initialize_ehframe_sections(Context<E> &ctx);
   void parse_note_gnu_property(Context <E> &ctx, const ElfShdr <E> &shdr);
-  void parse_ehframe(Context<E> &ctx);
   void override_symbol(Context<E> &ctx, Symbol<E> &sym,
                        const ElfSym<E> &esym, i64 symidx);
   void merge_visibility(Context<E> &ctx, Symbol<E> &sym, u8 visibility);
@@ -1272,6 +1390,7 @@ class SharedFile : public InputFile<E> {
   void resolve_symbols(Context<E> &ctx) override;
   std::span<Symbol<E> *> get_symbols_at(Symbol<E> *sym);
   i64 get_alignment(Symbol<E> *sym);
+  std::vector<std::string_view> get_dt_needed(Context<E> &ctx);
   bool is_readonly(Symbol<E> *sym);
 
   void mark_live_objects(Context<E> &ctx,
@@ -1285,10 +1404,11 @@ class SharedFile : public InputFile<E> {
   std::vector<ElfSym<E>> elf_syms2;
 
 private:
-  SharedFile(Context<E> &ctx, MappedFile *mf);
+  SharedFile(Context<E> &ctx, MappedFile *mf) : InputFile<E>(ctx, mf) {}
 
   std::string get_soname(Context<E> &ctx);
   void maybe_override_symbol(Symbol<E> &sym, const ElfSym<E> &esym);
+  std::vector<std::string_view> read_dt_needed(Context<E> &ctx);
   std::vector<std::string_view> read_verdef(Context<E> &ctx);
 
   std::vector<u16> versyms;
@@ -1303,15 +1423,13 @@ class SharedFile : public InputFile<E> {
 // linker-script.cc
 //
 
-template <typename E>
-void parse_linker_script(Context<E> &ctx, MappedFile *mf);
-
-template <typename E>
-std::string_view
-get_script_output_type(Context<E> &ctx, MappedFile *mf);
-
-template <typename E>
-void parse_version_script(Context<E> &ctx, MappedFile *mf);
+struct ReaderContext {  // input-reading state, passed by reference to read_file(), open_library(), find_library()
+  bool as_needed = false;
+  bool in_lib = false;
+  bool static_ = false;  // trailing underscore because `static` is a C++ keyword
+  bool whole_archive = false;
+  tbb::task_group *tg = nullptr;  // null by default; presumably non-owning — TODO confirm
+};
 
 struct DynamicPattern {
   std::string_view pattern;
@@ -1319,6 +1437,48 @@ struct DynamicPattern {
   bool is_cpp = false;
 };
 
+template <typename E>
+class Script {  // Parses one script file (linker script, version script or dynamic list)
+public:
+  Script(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf)
+    : ctx(ctx), rctx(rctx), mf(mf) {}
+
+  std::string_view get_script_output_type();
+  void parse_linker_script();
+  void parse_version_script();
+  std::vector<DynamicPattern> parse_dynamic_list();
+
+private:
+  [[noreturn]] void error(std::string_view pos, std::string msg);  // never returns to the caller
+
+  void tokenize();
+
+  std::span<std::string_view>
+  skip(std::span<std::string_view> tok, std::string_view str);
+
+  std::span<std::string_view> read_output_format(std::span<std::string_view> tok);
+  std::span<std::string_view> read_group(std::span<std::string_view> tok);
+
+  std::span<std::string_view>
+  read_version_script_commands(std::span<std::string_view> tok,
+                               std::string_view ver_str, u16 ver_idx,
+                               bool is_global, bool is_cpp);
+
+  std::span<std::string_view> read_version_script(std::span<std::string_view> tok);
+
+  MappedFile *resolve_path(std::string_view tok, bool check_target);
+
+  std::span<std::string_view>
+  read_dynamic_list_commands(std::span<std::string_view> tok,
+                             std::vector<DynamicPattern> &result, bool is_cpp);
+
+  Context<E> &ctx;
+  ReaderContext &rctx;
+  MappedFile *mf = nullptr;  // set by the constructor; must not default to itself (indeterminate read)
+  std::once_flag once;       // presumably makes tokenize() run at most once — TODO confirm
+  std::vector<std::string_view> tokens;
+};
+
 template <typename E>
 std::vector<DynamicPattern>
 parse_dynamic_list(Context<E> &ctx, std::string_view path);
@@ -1331,11 +1491,25 @@ template <typename E>
 ObjectFile<E> *read_lto_object(Context<E> &ctx, MappedFile *mb);
 
 template <typename E>
-std::vector<ObjectFile<E> *> do_lto(Context<E> &ctx);
+std::vector<ObjectFile<E> *> run_lto_plugin(Context<E> &ctx);
 
 template <typename E>
 void lto_cleanup(Context<E> &ctx);
 
+//
+// shrink-sections.cc
+//
+
+template <typename E>
+void shrink_sections(Context<E> &ctx);
+
+template <typename E>
+void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc);
+
+template <typename E>
+i64 compute_distance(Context<E> &ctx, Symbol<E> &sym,
+                     InputSection<E> &isec, const ElfRel<E> &rel);
+
 //
 // gc-sections.cc
 //
@@ -1368,7 +1542,8 @@ void print_map(Context<E> &ctx);
 // subprocess.cc
 //
 
-std::function<void()> fork_child();
+void fork_child();
+void notify_parent();
 
 template <typename E>
 [[noreturn]]
@@ -1394,11 +1569,10 @@ template <typename E> void apply_exclude_libs(Context<E> &);
 template <typename E> void create_synthetic_sections(Context<E> &);
 template <typename E> void set_file_priority(Context<E> &);
 template <typename E> void resolve_symbols(Context<E> &);
-template <typename E> void kill_eh_frame_sections(Context<E> &);
-template <typename E> void split_section_pieces(Context<E> &);
-template <typename E> void resolve_section_pieces(Context<E> &);
+template <typename E> void do_lto(Context<E> &);
+template <typename E> void parse_eh_frame_sections(Context<E> &);
+template <typename E> void create_merged_sections(Context<E> &);
 template <typename E> void convert_common_symbols(Context<E> &);
-template <typename E> void compute_merged_section_sizes(Context<E> &);
 template <typename E> void create_output_sections(Context<E> &);
 template <typename E> void add_synthetic_symbols(Context<E> &);
 template <typename E> void apply_section_align(Context<E> &);
@@ -1406,6 +1580,7 @@ template <typename E> void check_cet_errors(Context<E> &);
 template <typename E> void print_dependencies(Context<E> &);
 template <typename E> void write_repro_file(Context<E> &);
 template <typename E> void check_duplicate_symbols(Context<E> &);
+template <typename E> void check_shlib_undefined(Context<E> &);
 template <typename E> void check_symbol_types(Context<E> &);
 template <typename E> void sort_init_fini(Context<E> &);
 template <typename E> void sort_ctor_dtor(Context<E> &);
@@ -1417,29 +1592,58 @@ template <typename E> void claim_unresolved_symbols(Context<E> &);
 template <typename E> void scan_relocations(Context<E> &);
 template <typename E> void compute_imported_symbol_weakness(Context<E> &);
 template <typename E> void construct_relr(Context<E> &);
+template <typename E> void sort_dynsyms(Context<E> &);
 template <typename E> void create_output_symtab(Context<E> &);
 template <typename E> void report_undef_errors(Context<E> &);
 template <typename E> void create_reloc_sections(Context<E> &);
 template <typename E> void copy_chunks(Context<E> &);
-template <typename E> void rewrite_endbr(Context<E> &);
 template <typename E> void apply_version_script(Context<E> &);
 template <typename E> void parse_symbol_version(Context<E> &);
 template <typename E> void compute_import_export(Context<E> &);
 template <typename E> void compute_address_significance(Context<E> &);
-template <typename E> void clear_padding(Context<E> &);
+template <typename E> void separate_debug_sections(Context<E> &);
 template <typename E> void compute_section_headers(Context<E> &);
 template <typename E> i64 set_osec_offsets(Context<E> &);
 template <typename E> void fix_synthetic_symbols(Context<E> &);
-template <typename E> i64 compress_debug_sections(Context<E> &);
+template <typename E> void compress_debug_sections(Context<E> &);
+template <typename E> void write_build_id(Context<E> &);
+template <typename E> void write_gnu_debuglink(Context<E> &);
+template <typename E> void write_separate_debug_file(Context<E> &ctx);
 template <typename E> void write_dependency_file(Context<E> &);
 template <typename E> void show_stats(Context<E> &);
 
+//
+// arch-x86-64.cc
+//
+
+void rewrite_endbr(Context<X86_64> &ctx);
+
 //
 // arch-arm32.cc
 //
 
+class Arm32ExidxSection : public Chunk<ARM32> {  // Chunk named ".ARM.exidx", built around an existing OutputSection
+public:
+  Arm32ExidxSection(OutputSection<ARM32> &osec) : output_section(osec) {
+    this->name = ".ARM.exidx";
+    this->shdr.sh_type = SHT_ARM_EXIDX;
+    this->shdr.sh_flags = SHF_ALLOC;
+    this->shdr.sh_addralign = 4;
+  }
+
+  void compute_section_size(Context<ARM32> &ctx) override;
+  void update_shdr(Context<ARM32> &ctx) override;
+  void remove_duplicate_entries(Context<ARM32> &ctx);  // not an override: specific to this chunk
+  void copy_buf(Context<ARM32> &ctx) override;
+
+private:
+  std::vector<u8> get_contents(Context<ARM32> &ctx);
+
+  OutputSection<ARM32> &output_section;  // held by reference, so it must outlive this chunk
+};
+
 template <> u64 get_eflags(Context<ARM32> &ctx);
-void fixup_arm_exidx_section(Context<ARM32> &ctx);
+void create_arm_exidx_section(Context<ARM32> &ctx);
 
 //
 // arch-riscv.cc
@@ -1462,9 +1666,6 @@ class RiscvAttributesSection : public Chunk<E> {
 template <is_riscv E>
 u64 get_eflags(Context<E> &ctx);
 
-template <is_riscv E>
-i64 riscv_resize_sections(Context<E> &ctx);
-
 //
 // arch-ppc64v1.cc
 //
@@ -1512,60 +1713,23 @@ class PPC64SaveRestoreSection : public Chunk<PPC64V2> {
 
 template <> u64 get_eflags(Context<PPC64V2> &ctx);
 
-//
-// arch-sparc.cc
-//
-
-class SparcTlsGetAddrSection : public Chunk<SPARC64> {
-public:
-  SparcTlsGetAddrSection() {
-    this->name = ".tls_get_addr";
-    this->shdr.sh_type = SHT_PROGBITS;
-    this->shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR;
-    this->shdr.sh_addralign = 4;
-    this->shdr.sh_size = 24;
-  }
-
-  void copy_buf(Context<SPARC64> &ctx) override;
-};
-
-//
-// arch-alpha.cc
-//
-
-class AlphaGotSection : public Chunk<ALPHA> {
-public:
-  AlphaGotSection() {
-    this->name = ".alpha_got";
-    this->is_relro = true;
-    this->shdr.sh_type = SHT_PROGBITS;
-    this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE;
-    this->shdr.sh_addralign = 8;
-  }
-
-  void add_symbol(Symbol<ALPHA> &sym, i64 addend);
-  void finalize();
-  u64 get_addr(Symbol<ALPHA> &sym, i64 addend);
-  i64 get_reldyn_size(Context<ALPHA> &ctx) const override;
-  void copy_buf(Context<ALPHA> &ctx) override;
-
-  struct Entry {
-    bool operator==(const Entry &) const = default;
-    Symbol<ALPHA> *sym;
-    i64 addend;
-  };
-
-private:
-  std::vector<Entry> entries;
-  std::mutex mu;
-};
-
 //
 // main.cc
 //
 
 struct BuildId {
-  i64 size() const;
+  i64 size() const {
+    // Byte length of the build-id payload for the selected kind.
+    if (kind == HEX)
+      return value.size();
+    if (kind == HASH)
+      return hash_size;
+    if (kind == UUID)
+      return 16;
+
+    // NONE (or any other value) has no size; callers must not ask.
+    unreachable();
+  }
 
   enum { NONE, HEX, HASH, UUID } kind = NONE;
   std::vector<u8> value;
@@ -1624,6 +1788,11 @@ struct SectionOrder {
 template <typename E>
 struct ContextExtras {};
 
+template <>
+struct ContextExtras<ARM32> {  // ARM32-only additions to Context, reached as ctx.extra
+  Arm32ExidxSection *exidx = nullptr;  // null when absent; create_phdr() emits PT_ARM_EXIDX only if set
+};
+
 template <is_riscv E>
 struct ContextExtras<E> {
   RiscvAttributesSection<E> *riscv_attributes = nullptr;
@@ -1649,13 +1818,7 @@ struct ContextExtras<PPC64V2> {
 
 template <>
 struct ContextExtras<SPARC64> {
-  SparcTlsGetAddrSection *tls_get_addr_sec = nullptr;
-  Symbol<SPARC64> *tls_get_addr_sym = nullptr;
-};
-
-template <>
-struct ContextExtras<ALPHA> {
-  AlphaGotSection *got = nullptr;
+  Symbol<SPARC64> *tls_get_addr = nullptr;
 };
 
 // Context represents a context object for each invocation of the linker.
@@ -1668,6 +1831,9 @@ struct Context {
     arg.entry = get_symbol(*this, "_start");
     arg.fini = get_symbol(*this, "_fini");
     arg.init = get_symbol(*this, "_init");
+
+    if constexpr (is_sparc<E>)
+      extra.tls_get_addr = get_symbol(*this, "__tls_get_addr");
   }
 
   Context(const Context<E> &) = delete;
@@ -1681,6 +1847,7 @@ struct Context {
 
   // Command-line arguments
   struct {
+    BsymbolicKind Bsymbolic = BSYMBOLIC_NONE;
     BuildId build_id;
     CetReportKind z_cet_report = CET_REPORT_NONE;
     CompressKind compress_debug_sections = COMPRESS_NONE;
@@ -1691,14 +1858,16 @@ struct Context {
     Symbol<E> *fini = nullptr;
     Symbol<E> *init = nullptr;
     UnresolvedKind unresolved_symbols = UNRESOLVED_IGNORE;
-    BsymbolicKind Bsymbolic = BSYMBOLIC_NONE;
     bool allow_multiple_definition = false;
+    bool allow_shlib_undefined = true;
     bool apply_dynamic_relocs = true;
     bool color_diagnostics = false;
     bool default_symver = false;
     bool demangle = true;
+    bool detach = true;
     bool discard_all = false;
     bool discard_locals = false;
+    bool dynamic_list_data = false;
     bool eh_frame_hdr = true;
     bool emit_relocs = false;
     bool enable_new_dtags = true;
@@ -1713,7 +1882,6 @@ struct Context {
     bool icf = false;
     bool icf_all = false;
     bool ignore_data_address_equality = false;
-    bool is_static = false;
     bool lto_pass2 = false;
     bool nmagic = false;
     bool noinhibit_exec = false;
@@ -1735,6 +1903,7 @@ struct Context {
     bool rosegment = true;
     bool shared = false;
     bool start_stop = false;
+    bool static_ = false;
     bool stats = false;
     bool strip_all = false;
     bool strip_debug = false;
@@ -1770,8 +1939,6 @@ struct Context {
     i64 spare_program_headers = 0;
     i64 thread_count = 0;
     i64 z_stack_size = 0;
-    u64 shuffle_sections_seed;
-    std::string_view emulation;
     std::optional<Glob> unique;
     std::optional<u64> physical_image_base;
     std::string Map;
@@ -1783,9 +1950,11 @@ struct Context {
     std::string package_metadata;
     std::string plugin;
     std::string rpaths;
+    std::string separate_debug_file;
     std::string soname;
     std::string sysroot;
-    std::unique_ptr<std::unordered_set<std::string_view>> retain_symbols_file;
+    std::string_view emulation;
+    std::optional<std::vector<Symbol<E> *>> retain_symbols_file;
     std::unordered_map<std::string_view, u64> section_align;
     std::unordered_map<std::string_view, u64> section_start;
     std::unordered_set<std::string_view> ignore_ir_file;
@@ -1801,25 +1970,19 @@ struct Context {
     std::vector<std::string_view> exclude_libs;
     std::vector<std::string_view> filter;
     std::vector<std::string_view> trace_symbol;
+    u32 z_x86_64_isa_level = 0;
     u64 image_base = 0x200000;
+    u64 shuffle_sections_seed = 0;
   } arg;
 
   std::vector<VersionPattern> version_patterns;
   std::vector<DynamicPattern> dynamic_list_patterns;
   i64 default_version = VER_NDX_UNSPECIFIED;
   i64 page_size = E::page_size;
+  bool has_error = false;
 
   // Reader context
-  bool as_needed = false;
-  bool whole_archive = false;
-  bool is_static;
-  bool in_lib = false;
   i64 file_priority = 10000;
-  MappedFile *script_file = nullptr;
-  std::unordered_set<std::string_view> visited;
-  tbb::task_group tg;
-
-  bool has_error = false;
 
   // Symbol table
   tbb::concurrent_hash_map<std::string_view, Symbol<E>, HashCmp> symbol_map;
@@ -1850,7 +2013,7 @@ struct Context {
   std::vector<ElfSym<E>> internal_esyms;
 
   // Output buffer
-  std::unique_ptr<OutputFile<Context<E>>> output_file;
+  std::unique_ptr<OutputFile<E>> output_file;
   u8 *buf = nullptr;
   bool overwrite_output_file = true;
 
@@ -1861,6 +2024,9 @@ struct Context {
 
   tbb::concurrent_hash_map<Symbol<E> *, std::vector<std::string>> undef_errors;
 
+  // For --separate-debug-file
+  std::vector<Chunk<E> *> debug_chunks;
+
   // Output chunks
   OutputEhdr<E> *ehdr = nullptr;
   OutputShdr<E> *shdr = nullptr;
@@ -1876,6 +2042,7 @@ struct Context {
   DynstrSection<E> *dynstr = nullptr;
   HashSection<E> *hash = nullptr;
   GnuHashSection<E> *gnu_hash = nullptr;
+  GnuDebuglinkSection<E> *gnu_debuglink = nullptr;
   ShstrtabSection<E> *shstrtab = nullptr;
   PltSection<E> *plt = nullptr;
   PltGotSection<E> *pltgot = nullptr;
@@ -1895,6 +2062,7 @@ struct Context {
   NotePropertySection<E> *note_property = nullptr;
   GdbIndexSection<E> *gdb_index = nullptr;
   RelroPaddingSection<E> *relro_padding = nullptr;
+  MergedSection<E> *comment = nullptr;
 
   [[no_unique_address]] ContextExtras<E> extra;
 
@@ -1940,21 +2108,20 @@ struct Context {
 };
 
 template <typename E>
-std::string_view get_machine_type(Context<E> &ctx, MappedFile *mf);
+std::string_view
+get_machine_type(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf);
 
 template <typename E>
-MappedFile *open_library(Context<E> &ctx, std::string path);
+MappedFile *open_library(Context<E> &ctx, ReaderContext &rctx, std::string path);
 
 template <typename E>
-MappedFile *find_library(Context<E> &ctx, std::string path);
+MappedFile *find_library(Context<E> &ctx, ReaderContext &rctx, std::string path);
 
 template <typename E>
-void read_file(Context<E> &ctx, MappedFile *mf);
+void read_file(Context<E> &ctx, ReaderContext &rctx, MappedFile *mf);
 
 template <typename E>
-int elf_main(int argc, char **argv);
-
-int main(int argc, char **argv);
+int mold_main(int argc, char **argv);
 
 template <typename E>
 std::ostream &operator<<(std::ostream &out, const InputFile<E> &file);
@@ -2089,6 +2256,10 @@ class Symbol {
     TAG_MASK = 0b11,
   };
 
+  // We want to make sure there are enough unused bits in
+  // pointers referring to these structures. In particular, we need
+  // __attribute__((aligned(4))) for m68k on which int, long, float
+  // and double are aligned only to two byte boundaries.
   static_assert(alignof(InputSection<E>) >= 4);
   static_assert(alignof(Chunk<E>) >= 4);
   static_assert(alignof(SectionFragment<E>) >= 4);
@@ -2225,6 +2396,10 @@ class Symbol {
   bool has_copyrel : 1 = false;
   bool is_copyrel_readonly : 1 = false;
 
+  // For symbol resolution. This flag is used rarely. See a comment in
+  // resolve_symbols().
+  bool skip_dso : 1 = false;
+
   // For --gc-sections
   bool gc_root : 1 = false;
 
@@ -2337,12 +2512,19 @@ InputSection<E>::get_fragment(Context<E> &ctx, const ElfRel<E> &rel) {
   assert(!(shdr().sh_flags & SHF_ALLOC));
 
   const ElfSym<E> &esym = file.elf_syms[rel.r_sym];
+  if (esym.is_abs() || esym.is_common() || esym.is_undef())
+    return {nullptr, 0};
+
+  i64 shndx = file.get_shndx(esym);
+  std::unique_ptr<MergeableSection<E>> &m = file.mergeable_sections[shndx];
+  if (!m)
+    return {nullptr, 0};
+
   if (esym.st_type == STT_SECTION)
-    if (std::unique_ptr<MergeableSection<E>> &m =
-        file.mergeable_sections[file.get_shndx(esym)])
-      return m->get_fragment(esym.st_value + get_addend(*this, rel));
+    return m->get_fragment(esym.st_value + get_addend(*this, rel));
 
-  return {nullptr, 0};
+  std::pair<SectionFragment<E> *, i64> p = m->get_fragment(esym.st_value);
+  return {p.first, p.second + get_addend(*this, rel)};
 }
 
 template <typename E>
@@ -2377,31 +2559,31 @@ InputSection<E>::get_tombstone(Symbol<E> &sym, SectionFragment<E> *frag) {
   if (!isec || isec->is_alive)
     return {};
 
-  std::string_view s = name();
-  if (!s.starts_with(".debug"))
+  std::string_view str = name();
+  if (!str.starts_with(".debug"))
     return {};
 
   // If the section was dead due to ICF, we don't want to emit debug
   // info for that section but want to set real values to .debug_line so
   // that users can set a breakpoint inside a merged section.
-  if (isec->is_killed_by_icf() && s == ".debug_line")
+  if (isec->icf_removed() && str == ".debug_line")
     return {};
 
   // 0 is an invalid value in most debug info sections, so we use it
   // as a tombstone value. .debug_loc and .debug_ranges reserve 0 as
-  // the terminator marker, so we use 1 if that's the case.
-  return (s == ".debug_loc" || s == ".debug_ranges") ? 1 : 0;
+  // the terminator marker, so we use 1 if that's the case.
+  return (str == ".debug_loc" || str == ".debug_ranges") ? 1 : 0;
 }
 
 template <typename E>
-inline bool InputSection<E>::is_killed_by_icf() const {
+inline bool InputSection<E>::icf_removed() const {
   return this->leader && this->leader != this;
 }
 
 template <typename E>
 std::pair<SectionFragment<E> *, i64>
 MergeableSection<E>::get_fragment(i64 offset) {
-  std::vector<u32> &vec = frag_offsets;
+  std::span<u32> vec = frag_offsets;
   auto it = std::upper_bound(vec.begin(), vec.end(), offset);
   i64 idx = it - 1 - vec.begin();
   return {fragments[idx], offset - vec[idx]};
@@ -2411,8 +2593,8 @@ template <typename E>
 std::string_view MergeableSection<E>::get_contents(i64 i) {
   i64 cur = frag_offsets[i];
   if (i == frag_offsets.size() - 1)
-    return contents.substr(cur);
-  return contents.substr(cur, frag_offsets[i + 1] - cur);
+    return section->contents.substr(cur);
+  return section->contents.substr(cur, frag_offsets[i + 1] - cur);
 }
 
 template <typename E>
@@ -2462,6 +2644,8 @@ inline i64 ObjectFile<E>::get_shndx(const ElfSym<E> &esym) {
 
   if (esym.st_shndx == SHN_XINDEX)
     return symtab_shndx_sec[&esym - &this->elf_syms[0]];
+  if (esym.st_shndx >= SHN_LORESERVE)
+    return 0;
   return esym.st_shndx;
 }
 
@@ -2504,7 +2688,7 @@ u64 Symbol<E>::get_addr(Context<E> &ctx, i64 flags) const {
     return value; // absolute symbol
 
   if (!isec->is_alive) {
-    if (isec->is_killed_by_icf())
+    if (isec->icf_removed())
       return isec->leader->get_addr() + value;
 
     if (isec->name() == ".eh_frame") {
@@ -2898,4 +3082,13 @@ inline bool is_c_identifier(std::string_view s) {
   return true;
 }
 
-} // namespace mold::elf
+template <typename E>
+std::string_view save_string(Context<E> &ctx, const std::string &str) {  // copy STR into ctx.string_pool
+  u8 *buf = new u8[str.size() + 1];  // one extra byte for the NUL terminator
+  memcpy(buf, str.data(), str.size());
+  buf[str.size()] = '\0';
+  ctx.string_pool.push_back(std::unique_ptr<u8[]>(buf));  // the pool takes ownership of the copy
+  return {(char *)buf, str.size()};  // view excludes the terminator
+}
+
+} // namespace mold
diff --git a/elf/output-chunks.cc b/src/output-chunks.cc
similarity index 86%
rename from elf/output-chunks.cc
rename to src/output-chunks.cc
index 4337653d..149859ab 100644
--- a/elf/output-chunks.cc
+++ b/src/output-chunks.cc
@@ -1,7 +1,5 @@
 #include "mold.h"
-
 #include "config.h"
-#include "blake3.h"
 
 #include <cctype>
 #include <set>
@@ -11,7 +9,7 @@
 #include <tbb/parallel_scan.h>
 #include <tbb/parallel_sort.h>
 
-namespace mold::elf {
+namespace mold {
 
 // The hash function for .hash.
 static u32 elf_hash(std::string_view name) {
@@ -26,29 +24,19 @@ static u32 elf_hash(std::string_view name) {
   return h;
 }
 
-// The hash function for .gnu.hash.
-static u32 djb_hash(std::string_view name) {
-  u32 h = 5381;
-  for (u8 c : name)
-    h = (h << 5) + h + c;
-  return h;
-}
-
 template <typename E>
-OutputSection<E> *find_section(Context<E> &ctx, u32 sh_type) {
+Chunk<E> *find_chunk(Context<E> &ctx, u32 sh_type) {
   for (Chunk<E> *chunk : ctx.chunks)
-    if (OutputSection<E> *osec = chunk->to_osec())
-      if (osec->shdr.sh_type == sh_type)
-        return osec;
+    if (chunk->shdr.sh_type == sh_type)
+      return chunk;
   return nullptr;
 }
 
 template <typename E>
-OutputSection<E> *find_section(Context<E> &ctx, std::string_view name) {
+Chunk<E> *find_chunk(Context<E> &ctx, std::string_view name) {
   for (Chunk<E> *chunk : ctx.chunks)
-    if (OutputSection<E> *osec = chunk->to_osec())
-      if (osec->name == name)
-        return osec;
+    if (chunk->name == name)
+      return chunk;
   return nullptr;
 }
 
@@ -165,10 +153,17 @@ static std::vector<ElfPhdr<E>> create_phdr(Context<E> &ctx) {
     phdr.p_type = type;
     phdr.p_flags = flags;
     phdr.p_align = chunk->shdr.sh_addralign;
-    phdr.p_offset = chunk->shdr.sh_offset;
 
-    if (chunk->shdr.sh_type != SHT_NOBITS)
+    if (chunk->shdr.sh_type == SHT_NOBITS) {
+      // p_offset indicates the in-file start offset and is not
+      // significant for segments with zero on-file size. We still want to
+      // keep it congruent with the virtual address modulo page size
+      // because some loaders (at least FreeBSD's) are picky about it.
+      phdr.p_offset = chunk->shdr.sh_addr % ctx.page_size;
+    } else {
+      phdr.p_offset = chunk->shdr.sh_offset;
       phdr.p_filesz = chunk->shdr.sh_size;
+    }
 
     phdr.p_vaddr = chunk->shdr.sh_addr;
     phdr.p_paddr = chunk->shdr.sh_addr;
@@ -277,6 +272,10 @@ static std::vector<ElfPhdr<E>> create_phdr(Context<E> &ctx) {
   if (ctx.eh_frame_hdr)
     define(PT_GNU_EH_FRAME, PF_R, ctx.eh_frame_hdr);
 
+  // Add PT_GNU_PROPERTY
+  if (Chunk<E> *chunk = find_chunk(ctx, ".note.gnu.property"))
+    define(PT_GNU_PROPERTY, PF_R, chunk);
+
   // Add PT_GNU_STACK, which is a marker segment that doesn't really
   // contain any segments. It controls executable bit of stack area.
   {
@@ -303,8 +302,8 @@ static std::vector<ElfPhdr<E>> create_phdr(Context<E> &ctx) {
 
   // Create a PT_ARM_EDXIDX
   if constexpr (is_arm32<E>)
-    if (OutputSection<E> *osec = find_section(ctx, SHT_ARM_EXIDX))
-      define(PT_ARM_EXIDX, PF_R, osec);
+    if (ctx.extra.exidx)
+      define(PT_ARM_EXIDX, PF_R, ctx.extra.exidx);
 
   // Create a PT_RISCV_ATTRIBUTES
   if constexpr (is_riscv<E>)
@@ -371,9 +370,14 @@ void OutputPhdr<E>::update_shdr(Context<E> &ctx) {
   phdrs = create_phdr(ctx);
   this->shdr.sh_size = phdrs.size() * sizeof(ElfPhdr<E>);
 
-  ctx.tls_begin = get_tls_begin(ctx);
-  ctx.tp_addr = get_tp_addr(ctx);
-  ctx.dtp_addr = get_dtp_addr(ctx);
+  for (ElfPhdr<E> &phdr : phdrs) {
+    if (phdr.p_type == PT_TLS) {
+      ctx.tls_begin = phdr.p_vaddr;
+      ctx.tp_addr = get_tp_addr(phdr);
+      ctx.dtp_addr = get_dtp_addr(phdr);
+      break;
+    }
+  }
 }
 
 template <typename E>
@@ -400,11 +404,6 @@ void RelDynSection<E>::update_shdr(Context<E> &ctx) {
     offset += chunk->get_reldyn_size(ctx) * sizeof(ElfRel<E>);
   }
 
-  for (ObjectFile<E> *file : ctx.objs) {
-    file->reldyn_offset = offset;
-    offset += file->num_dynrel * sizeof(ElfRel<E>);
-  }
-
   this->shdr.sh_size = offset;
   this->shdr.sh_link = ctx.dynsym->shndx;
 }
@@ -478,7 +477,7 @@ void StrtabSection<E>::update_shdr(Context<E> &ctx) {
   // affect correctness of the program but helps disassembler to
   // disassemble machine code appropriately.
   if constexpr (is_arm32<E>)
-    if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file)
+    if (!ctx.arg.strip_all)
       offset += sizeof("$a\0$t\0$d");
 
   for (Chunk<E> *chunk : ctx.chunks) {
@@ -505,7 +504,7 @@ void StrtabSection<E>::copy_buf(Context<E> &ctx) {
   buf[0] = '\0';
 
   if constexpr (is_arm32<E>)
-    if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file)
+    if (!ctx.arg.strip_all)
       memcpy(buf + 1, "$a\0$t\0$d", 9);
 }
 
@@ -739,19 +738,19 @@ static std::vector<Word<E>> create_dynamic_section(Context<E> &ctx) {
     define(DT_STRSZ, ctx.dynstr->shdr.sh_size);
   }
 
-  if (find_section(ctx, SHT_INIT_ARRAY)) {
+  if (find_chunk(ctx, SHT_INIT_ARRAY)) {
     define(DT_INIT_ARRAY, ctx.__init_array_start->value);
     define(DT_INIT_ARRAYSZ,
            ctx.__init_array_end->value - ctx.__init_array_start->value);
   }
 
-  if (find_section(ctx, SHT_PREINIT_ARRAY)) {
+  if (find_chunk(ctx, SHT_PREINIT_ARRAY)) {
     define(DT_PREINIT_ARRAY, ctx.__preinit_array_start->value);
     define(DT_PREINIT_ARRAYSZ,
            ctx.__preinit_array_end->value - ctx.__preinit_array_start->value);
   }
 
-  if (find_section(ctx, SHT_FINI_ARRAY)) {
+  if (find_chunk(ctx, SHT_FINI_ARRAY)) {
     define(DT_FINI_ARRAY, ctx.__fini_array_start->value);
     define(DT_FINI_ARRAYSZ,
            ctx.__fini_array_end->value - ctx.__fini_array_start->value);
@@ -853,7 +852,7 @@ static std::vector<Word<E>> create_dynamic_section(Context<E> &ctx) {
 
 template <typename E>
 void DynamicSection<E>::update_shdr(Context<E> &ctx) {
-  if (ctx.arg.is_static && !ctx.arg.pie)
+  if (ctx.arg.static_ && !ctx.arg.pie)
     return;
 
   this->shdr.sh_size = create_dynamic_section(ctx).size() * sizeof(Word<E>);
@@ -863,20 +862,104 @@ void DynamicSection<E>::update_shdr(Context<E> &ctx) {
 template <typename E>
 void DynamicSection<E>::copy_buf(Context<E> &ctx) {
   std::vector<Word<E>> contents = create_dynamic_section(ctx);
-  assert(this->shdr.sh_size == contents.size() * sizeof(contents[0]));
+  assert(this->shdr.sh_size == contents.size() * sizeof(Word<E>));
   write_vector(ctx.buf + this->shdr.sh_offset, contents);
 }
 
+template <typename T>
+static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
+  // Cut INPUT into consecutive views of UNIT elements; the last may be shorter.
+  std::vector<std::span<T>> pieces;
+  std::span<T> rest(input);
+
+  for (; rest.size() >= unit; rest = rest.subspan(unit))
+    pieces.push_back(rest.first(unit));
+
+  if (!rest.empty())
+    pieces.push_back(rest);
+  return pieces;
+}
+
+
+// Assign offsets to OutputSection members
+template <typename E>
+void OutputSection<E>::compute_section_size(Context<E> &ctx) {
+  ElfShdr<E> &shdr = this->shdr;
+
+  // On most RISC systems, we need to create so-called "range extension
+  // thunks" to extend branch instructions reach, as their jump
+  // instructions' reach is limited. create_range_extension_thunks()
+  // computes the size of the section while inserting thunks.
+  if constexpr (needs_thunk<E>) {
+    if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) {
+      create_range_extension_thunks(ctx);
+      return;  // the grouped layout below is skipped for this case
+    }
+  }
+
+  // Since one output section may contain millions of input sections,
+  // we first split input sections into groups and assign offsets to
+  // groups.
+  struct Group {
+    std::span<InputSection<E> *> members;
+    i64 size = 0;     // bytes needed by the members, laid out from offset 0
+    i64 p2align = 0;  // largest p2align among the members
+    i64 offset = 0;   // start of the group within the output section
+  };
+
+  std::span<InputSection<E> *> mem = members;
+  std::vector<Group> groups;
+  constexpr i64 group_size = 10000;  // maximum number of input sections per group
+
+  while (!mem.empty()) {
+    i64 sz = std::min<i64>(group_size, mem.size());
+    groups.push_back({mem.subspan(0, sz)});
+    mem = mem.subspan(sz);
+  }
+
+  tbb::parallel_for_each(groups, [](Group &group) {  // pass 1: per-group size and alignment
+    for (InputSection<E> *isec : group.members) {
+      group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
+      group.p2align = std::max<i64>(group.p2align, isec->p2align);
+    }
+  });
+
+  shdr.sh_size = 0;
+
+  for (i64 i = 0; i < groups.size(); i++) {  // serial pass: place the groups back to back
+    shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
+    groups[i].offset = shdr.sh_size;
+    shdr.sh_size += groups[i].size;
+    shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
+  }
+
+  // Assign offsets to input sections.
+  tbb::parallel_for_each(groups, [](Group &group) {
+    i64 offset = group.offset;
+    for (InputSection<E> *isec : group.members) {
+      offset = align_to(offset, 1 << isec->p2align);
+      isec->offset = offset;
+      offset += isec->sh_size;
+    }
+  });
+}
+
 template <typename E>
 void OutputSection<E>::copy_buf(Context<E> &ctx) {
-  if (this->shdr.sh_type != SHT_NOBITS)
-    write_to(ctx, ctx.buf + this->shdr.sh_offset);
+  if (this->shdr.sh_type != SHT_NOBITS) {
+    ElfRel<E> *rel = nullptr;
+    if (ctx.reldyn)
+      rel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
+                          this->reldyn_offset);
+
+    write_to(ctx, ctx.buf + this->shdr.sh_offset, rel);
+  }
 }
 
 template <typename E>
-void OutputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
+void OutputSection<E>::write_to(Context<E> &ctx, u8 *buf, ElfRel<E> *rel) {
+  // Copy section contents to an output file.
   tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) {
-    // Copy section contents to an output file.
     InputSection<E> &isec = *members[i];
     isec.write_to(ctx, buf + isec.offset);
 
@@ -901,11 +984,46 @@ void OutputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
     }
   });
 
+  // Emit range extension thunks.
   if constexpr (needs_thunk<E>) {
     tbb::parallel_for_each(thunks, [&](std::unique_ptr<Thunk<E>> &thunk) {
       thunk->copy_buf(ctx);
     });
   }
+
+  // Emit dynamic relocations.
+  for (AbsRel<E> &r : abs_rels) {
+    Word<E> *loc = (Word<E> *)(buf + r.isec->offset + r.offset);
+    u64 addr = this->shdr.sh_addr + r.isec->offset + r.offset;
+    Symbol<E> &sym = *r.sym;
+
+    switch (r.kind) {
+    case ABS_REL_NONE:
+    case ABS_REL_RELR:
+      *loc = sym.get_addr(ctx) + r.addend;
+      break;
+    case ABS_REL_BASEREL: {
+      u64 val = sym.get_addr(ctx) + r.addend;
+      *rel++ = ElfRel<E>(addr, E::R_RELATIVE, 0, val);
+      if (ctx.arg.apply_dynamic_relocs)
+        *loc = val;
+      break;
+    }
+    case ABS_REL_IFUNC:
+      if constexpr (supports_ifunc<E>) {
+        u64 val = sym.get_addr(ctx, NO_PLT) + r.addend;
+        *rel++ = ElfRel<E>(addr, E::R_IRELATIVE, 0, val);
+        if (ctx.arg.apply_dynamic_relocs)
+          *loc = val;
+      }
+      break;
+    case ABS_REL_DYNREL:
+      *rel++ = ElfRel<E>(addr, E::R_ABS, sym.get_dynsym_idx(ctx), r.addend);
+      if (ctx.arg.apply_dynamic_relocs)
+        *loc = r.addend;
+      break;
+    }
+  }
 }
 
 // .relr.dyn contains base relocations encoded in a space-efficient form.
@@ -927,25 +1045,24 @@ void OutputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
 // the .rel.dyn section). A bitmap has LSB 1.
 template <typename E>
 static std::vector<u64> encode_relr(std::span<u64> pos) {
+  for (i64 i = 0; i < pos.size(); i++) {
+    assert(pos[i] % sizeof(Word<E>) == 0);
+    assert(i == 0 || pos[i - 1] < pos[i]);
+  }
+
   std::vector<u64> vec;
   i64 num_bits = E::is_64 ? 63 : 31;
   i64 max_delta = sizeof(Word<E>) * num_bits;
 
   for (i64 i = 0; i < pos.size();) {
-    assert(i == 0 || pos[i - 1] < pos[i]);
-    assert(pos[i] % sizeof(Word<E>) == 0);
-
     vec.push_back(pos[i]);
     u64 base = pos[i] + sizeof(Word<E>);
     i++;
 
     for (;;) {
       u64 bits = 0;
-      for (; i < pos.size() && pos[i] - base < max_delta; i++) {
-        assert(pos[i - 1] < pos[i]);
-        assert(pos[i] % sizeof(Word<E>) == 0);
+      for (; i < pos.size() && pos[i] - base < max_delta; i++)
         bits |= (u64)1 << ((pos[i] - base) / sizeof(Word<E>));
-      }
 
       if (!bits)
         break;
@@ -958,35 +1075,92 @@ static std::vector<u64> encode_relr(std::span<u64> pos) {
 }
 
 template <typename E>
-void OutputSection<E>::construct_relr(Context<E> &ctx) {
-  if (!ctx.arg.pic)
-    return;
-  if (!(this->shdr.sh_flags & SHF_ALLOC))
-    return;
-  if (this->shdr.sh_addralign % sizeof(Word<E>))
-    return;
+static AbsRelKind get_abs_rel_kind(Context<E> &ctx, Symbol<E> &sym) {
+  if (sym.is_ifunc())
+    return sym.is_pde_ifunc(ctx) ? ABS_REL_NONE : ABS_REL_IFUNC;
 
-  // Skip it if it is a text section because .text doesn't usually
-  // contain any dynamic relocations.
-  if (this->shdr.sh_flags & SHF_EXECINSTR)
-    return;
+  if (sym.is_absolute())
+    return ABS_REL_NONE;
 
-  // Collect base relocations
-  std::vector<std::vector<u64>> shards(members.size());
+  // True if the symbol's address is in the output file.
+  if (!sym.is_imported || (sym.flags & NEEDS_CPLT) || (sym.flags & NEEDS_COPYREL))
+    return ctx.arg.pic ? ABS_REL_BASEREL : ABS_REL_NONE;
 
-  tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) {
-    InputSection<E> &isec = *members[i];
+  return ABS_REL_DYNREL;
+}
 
-    if (isec.shdr().sh_addralign % sizeof(Word<E>) == 0)
-      for (const ElfRel<E> &r : isec.get_rels(ctx))
-        if (r.r_type == E::R_ABS && r.r_offset % sizeof(Word<E>) == 0)
-          if (Symbol<E> &sym = *isec.file.symbols[r.r_sym];
-              !sym.is_ifunc() && !sym.is_absolute() && !sym.is_imported)
-            shards[i].push_back(isec.offset + r.r_offset);
+// Scan word-size absolute relocations (e.g. R_X86_64_64). This is
+// separated from scan_relocations() because only such relocations can
+// be promoted to dynamic relocations.
+template <typename E>
+void OutputSection<E>::scan_abs_relocations(Context<E> &ctx) {
+  std::vector<std::vector<AbsRel<E>>> shards(members.size());
+
+  // Collect all word-size absolute relocations
+  tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) {
+    InputSection<E> *isec = members[i];
+    for (const ElfRel<E> &r : isec->get_rels(ctx))
+      if (r.r_type == E::R_ABS)
+        shards[i].push_back(AbsRel<E>{isec, r.r_offset, isec->file.symbols[r.r_sym],
+                                      get_addend(*isec, r)});
   });
 
-  // Compress them
-  std::vector<u64> pos = flatten(shards);
+  abs_rels = flatten(shards);
+
+  // We can sometimes avoid creating dynamic relocations in read-only
+  // sections by promoting symbols to canonical PLT or copy relocations.
+  if (!ctx.arg.pic && !(this->shdr.sh_flags & SHF_WRITE))
+    for (AbsRel<E> &r : abs_rels)
+      if (Symbol<E> &sym = *r.sym;
+          sym.is_imported && !sym.is_absolute())
+        sym.flags |= (sym.get_type() == STT_FUNC) ? NEEDS_CPLT : NEEDS_COPYREL;
+
+  // Now we can compute whether they need to be promoted to dynamic
+  // relocations or not.
+  for (AbsRel<E> &r : abs_rels)
+    r.kind = get_abs_rel_kind(ctx, *r.sym);
+
+  // If we have a relocation against a read-only section, we need to
+  // set the DT_TEXTREL flag for the loader.
+  for (AbsRel<E> &r : abs_rels) {
+    if (r.kind != ABS_REL_NONE && !(r.isec->shdr().sh_flags & SHF_WRITE)) {
+      if (ctx.arg.z_text) {
+        Error(ctx) << *r.isec << ": relocation at offset 0x"
+                   << std::hex << r.offset << " against symbol `"
+                   << *r.sym << "' can not be used; recompile with -fPIC";
+      } else if (ctx.arg.warn_textrel) {
+        Warn(ctx) << *r.isec << ": relocation against symbol `" << *r.sym
+                  << "' in read-only section";
+      }
+      ctx.has_textrel = true;
+    }
+  }
+
+  // If --pack-dyn-relocs=relr is enabled, base relocations are put into
+  // .relr.dyn.
+  if (ctx.arg.pack_dyn_relocs_relr)
+    for (AbsRel<E> &r : abs_rels)
+      if (r.kind == ABS_REL_BASEREL &&
+          r.isec->shdr().sh_addralign % sizeof(Word<E>) == 0 &&
+          r.offset % sizeof(Word<E>) == 0)
+        r.kind = ABS_REL_RELR;
+}
+
+// Count the abs_rels that write_to() emits as dynamic relocation records.
+template <typename E>
+i64 OutputSection<E>::get_reldyn_size(Context<E> &ctx) const {
+  auto emits_dynrel = [](const AbsRel<E> &r) {
+    return r.kind != ABS_REL_NONE && r.kind != ABS_REL_RELR;
+  };
+  return std::count_if(abs_rels.begin(), abs_rels.end(), emits_dynrel);
+}
+
+template <typename E>
+void OutputSection<E>::construct_relr(Context<E> &ctx) {
+  std::vector<u64> pos;
+  for (const AbsRel<E> &r : abs_rels)
+    if (r.kind == ABS_REL_RELR)
+      pos.push_back(r.isec->offset + r.offset);
   this->relr = encode_relr<E>(pos);
 }
 
@@ -1091,7 +1265,7 @@ void GotSection<E>::add_tlsdesc_symbol(Context<E> &ctx, Symbol<E> *sym) {
   // statically-linked executable), we always relax TLSDESC relocations
   // so that no TLSDESC relocation exist at runtime.
   assert(supports_tlsdesc<E>);
-  assert(!ctx.arg.is_static);
+  assert(!ctx.arg.static_);
 
   sym->set_tlsdesc_idx(ctx, this->shdr.sh_size / sizeof(Word<E>));
   this->shdr.sh_size += sizeof(Word<E>) * 2;
@@ -1262,11 +1436,11 @@ void GotSection<E>::copy_buf(Context<E> &ctx) {
       buf[0] = ctx.dynamic->shdr.sh_addr;
 
   // arm64 psABI doesn't say anything about GOT[0], but glibc/arm64's code
-  // path for -static-pie wrongly assumed that GOT[0] refers _DYNAMIC.
+  // path for -static-pie wrongly assumed that GOT[0] refers to _DYNAMIC.
   //
   // https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=43d06ed218fc8be5
   if constexpr (is_arm64<E>)
-    if (ctx.dynamic && ctx.arg.is_static && ctx.arg.pie)
+    if (ctx.dynamic && ctx.arg.static_ && ctx.arg.pie)
       buf[0] = ctx.dynamic->shdr.sh_addr;
 
   ElfRel<E> *rel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
@@ -1307,13 +1481,10 @@ void GotSection<E>::copy_buf(Context<E> &ctx) {
 
 template <typename E>
 void GotSection<E>::construct_relr(Context<E> &ctx) {
-  assert(ctx.arg.pack_dyn_relocs_relr);
-
   std::vector<u64> pos;
   for (GotEntry<E> &ent : get_got_entries(ctx))
     if (ent.is_relr(ctx))
       pos.push_back(ent.idx * sizeof(Word<E>));
-
   this->relr = encode_relr<E>(pos);
 }
 
@@ -1600,9 +1771,6 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
   if constexpr (is_ppc64v2<E>)
     esym.ppc_local_entry = sym.esym().ppc_local_entry;
 
-  if constexpr (is_alpha<E>)
-    esym.alpha_st_other = sym.esym().alpha_st_other;
-
   auto get_st_shndx = [&](Symbol<E> &sym) -> u32 {
     if (SectionFragment<E> *frag = sym.get_frag())
       if (frag->is_alive)
@@ -1615,7 +1783,7 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
     if (InputSection<E> *isec = sym.get_input_section()) {
       if (isec->is_alive)
         return isec->output_section->shndx;
-      else if (isec->is_killed_by_icf())
+      if (isec->icf_removed())
         return isec->leader->output_section->shndx;
     }
 
@@ -1623,10 +1791,14 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
   };
 
   i64 shndx = -1;
+  InputSection<E> *isec = sym.get_input_section();
+
   if (sym.has_copyrel) {
+    // Symbol in .copyrel
     shndx = sym.is_copyrel_readonly ? ctx.copyrel_relro->shndx : ctx.copyrel->shndx;
     esym.st_value = sym.get_addr(ctx);
   } else if (sym.file->is_dso || sym.esym().is_undef()) {
+    // Undefined symbol in a DSO
     esym.st_shndx = SHN_UNDEF;
     esym.st_size = 0;
     if (sym.is_canonical)
@@ -1639,7 +1811,7 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
     // Section fragment
     shndx = frag->output_section.shndx;
     esym.st_value = sym.get_addr(ctx);
-  } else if (!sym.get_input_section()) {
+  } else if (!isec) {
     // Absolute symbol
     esym.st_shndx = SHN_ABS;
     esym.st_value = sym.get_addr(ctx);
@@ -1653,7 +1825,22 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
     esym.st_type = STT_FUNC;
     esym.st_visibility = sym.visibility;
     esym.st_value = sym.get_plt_addr(ctx);
+  } else if ((isec->shdr().sh_flags & SHF_MERGE) &&
+             !(isec->shdr().sh_flags & SHF_ALLOC)) {
+    // Symbol in a mergeable non-SHF_ALLOC section, such as .debug_str
+    ObjectFile<E> *file = (ObjectFile<E> *)sym.file;
+    MergeableSection<E> &m =
+      *file->mergeable_sections[file->get_shndx(sym.esym())];
+
+    SectionFragment<E> *frag;
+    i64 frag_addend;
+    std::tie(frag, frag_addend) = m.get_fragment(sym.esym().st_value);
+
+    shndx = m.parent.shndx;
+    esym.st_visibility = sym.visibility;
+    esym.st_value = frag->get_addr(ctx) + frag_addend;
   } else {
+    // Symbol in a regular section
     shndx = get_st_shndx(sym);
     esym.st_visibility = sym.visibility;
     esym.st_value = sym.get_addr(ctx, NO_PLT);
@@ -1677,8 +1864,6 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
 
 template <typename E>
 void DynsymSection<E>::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
-  assert(!finalized);
-
   if (symbols.empty())
     symbols.resize(1);
 
@@ -1688,62 +1873,6 @@ void DynsymSection<E>::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
   }
 }
 
-template <typename E>
-void DynsymSection<E>::finalize(Context<E> &ctx) {
-  Timer t(ctx, "DynsymSection::finalize");
-  assert(!finalized);
-  finalized = true;
-
-  if (symbols.empty())
-    return;
-
-  // Sort symbols. In any symtab, local symbols must precede global symbols.
-  auto first_global = std::stable_partition(symbols.begin() + 1, symbols.end(),
-                                            [&](Symbol<E> *sym) {
-    return sym->is_local(ctx);
-  });
-
-  // We also place undefined symbols before defined symbols for .gnu.hash.
-  // Defined symbols are sorted by their hashes for .gnu.hash.
-  if (ctx.gnu_hash) {
-    // Count the number of exported symbols to compute the size of .gnu.hash.
-    i64 num_exported = 0;
-    for (i64 i = 1; i < symbols.size(); i++)
-      if (symbols[i]->is_exported)
-        num_exported++;
-
-    u32 num_buckets = num_exported / ctx.gnu_hash->LOAD_FACTOR + 1;
-    ctx.gnu_hash->num_buckets = num_buckets;
-
-    tbb::parallel_for_each(first_global, symbols.end(), [&](Symbol<E> *sym) {
-      sym->set_djb_hash(ctx, djb_hash(sym->name()));
-    });
-
-    tbb::parallel_sort(first_global, symbols.end(),
-                       [&](Symbol<E> *a, Symbol<E> *b) {
-      if (a->is_exported != b->is_exported)
-        return b->is_exported;
-
-      return std::tuple(a->get_djb_hash(ctx) % num_buckets, a->name()) <
-             std::tuple(b->get_djb_hash(ctx) % num_buckets, b->name());
-    });
-  }
-
-  // Compute .dynstr size
-  ctx.dynstr->dynsym_offset = ctx.dynstr->shdr.sh_size;
-
-  tbb::enumerable_thread_specific<i64> size;
-  tbb::parallel_for((i64)1, (i64)symbols.size(), [&](i64 i) {
-    symbols[i]->set_dynsym_idx(ctx, i);
-    size.local() += symbols[i]->name().size() + 1;
-  });
-
-  ctx.dynstr->shdr.sh_size += size.combine(std::plus());
-
-  // ELF's symbol table sh_info holds the offset of the first global symbol.
-  this->shdr.sh_info = first_global - symbols.begin();
-}
-
 template <typename E>
 void DynsymSection<E>::update_shdr(Context<E> &ctx) {
   this->shdr.sh_link = ctx.dynstr->shndx;
@@ -1802,34 +1931,20 @@ void HashSection<E>::copy_buf(Context<E> &ctx) {
   }
 }
 
-template <typename E>
-static std::span<Symbol<E> *> get_exported_symbols(Context<E> &ctx) {
-  std::span<Symbol<E> *> syms = ctx.dynsym->symbols;
-  auto it = std::partition_point(syms.begin() + 1, syms.end(),
-                                 [](Symbol<E> *sym) {
-    return !sym->is_exported;
-  });
-  return syms.subspan(it - syms.begin());
-}
-
 template <typename E>
 void GnuHashSection<E>::update_shdr(Context<E> &ctx) {
   if (ctx.dynsym->symbols.empty())
     return;
 
-  this->shdr.sh_link = ctx.dynsym->shndx;
-
-  i64 num_exported = get_exported_symbols(ctx).size();
-  if (num_exported) {
-    // We allocate 12 bits for each symbol in the bloom filter.
-    i64 num_bits = num_exported * 12;
-    num_bloom = bit_ceil(num_bits / (sizeof(Word<E>) * 8));
-  }
+  // We allocate 12 bits for each symbol in the bloom filter.
+  num_bloom = bit_ceil((num_exported * 12) / (sizeof(Word<E>) * 8));
 
   this->shdr.sh_size = HEADER_SIZE;                  // Header
   this->shdr.sh_size += num_bloom * sizeof(Word<E>); // Bloom filter
   this->shdr.sh_size += num_buckets * 4;             // Hash buckets
   this->shdr.sh_size += num_exported * 4;            // Hash values
+
+  this->shdr.sh_link = ctx.dynsym->shndx;
 }
 
 template <typename E>
@@ -1837,12 +1952,15 @@ void GnuHashSection<E>::copy_buf(Context<E> &ctx) {
   u8 *base = ctx.buf + this->shdr.sh_offset;
   memset(base, 0, this->shdr.sh_size);
 
-  std::span<Symbol<E> *> syms = get_exported_symbols(ctx);
-  std::vector<u32> indices(syms.size());
-  i64 exported_offset = ctx.dynsym->symbols.size() - syms.size();
+  i64 first_exported = ctx.dynsym->symbols.size() - num_exported;
+
+  std::span<Symbol<E> *> syms = ctx.dynsym->symbols;
+  syms = syms.subspan(first_exported);
+
+  std::vector<u32> indices(num_exported);
 
   *(U32<E> *)base = num_buckets;
-  *(U32<E> *)(base + 4) = exported_offset;
+  *(U32<E> *)(base + 4) = first_exported;
   *(U32<E> *)(base + 8) = num_bloom;
   *(U32<E> *)(base + 12) = BLOOM_SHIFT;
 
@@ -1865,7 +1983,7 @@ void GnuHashSection<E>::copy_buf(Context<E> &ctx) {
 
   for (i64 i = 0; i < syms.size(); i++)
     if (!buckets[indices[i]])
-      buckets[indices[i]] = i + exported_offset;
+      buckets[indices[i]] = i + first_exported;
 
   // Write a hash table
   U32<E> *table = buckets + num_buckets;
@@ -1915,15 +2033,26 @@ MergedSection<E>::MergedSection(std::string_view name, i64 flags, i64 type,
 template <typename E>
 MergedSection<E> *
 MergedSection<E>::get_instance(Context<E> &ctx, std::string_view name,
-                               i64 type, i64 flags,
-                               i64 entsize, i64 addralign) {
+                               const ElfShdr<E> &shdr) {
+  if (!(shdr.sh_flags & SHF_MERGE))
+    return nullptr;
+
+  i64 addralign = std::max<i64>(1, shdr.sh_addralign);
+  i64 flags = shdr.sh_flags & ~(u64)SHF_GROUP & ~(u64)SHF_COMPRESSED;
+
+  i64 entsize = shdr.sh_entsize;
+  if (entsize == 0)
+    entsize = (shdr.sh_flags & SHF_STRINGS) ? 1 : (i64)shdr.sh_addralign;
+  if (entsize == 0)
+    return nullptr;
+
   name = get_merged_output_name(ctx, name, flags, entsize, addralign);
-  flags = flags & ~(u64)SHF_GROUP & ~(u64)SHF_COMPRESSED;
 
   auto find = [&]() -> MergedSection * {
     for (std::unique_ptr<MergedSection<E>> &osec : ctx.merged_sections)
       if (name == osec->name && flags == osec->shdr.sh_flags &&
-          type == osec->shdr.sh_type && entsize == osec->shdr.sh_entsize)
+          shdr.sh_type == osec->shdr.sh_type &&
+          entsize == osec->shdr.sh_entsize)
         return osec.get();
     return nullptr;
   };
@@ -1941,7 +2070,7 @@ MergedSection<E>::get_instance(Context<E> &ctx, std::string_view name,
   if (MergedSection *osec = find())
     return osec;
 
-  MergedSection *osec = new MergedSection(name, flags, type, entsize);
+  MergedSection *osec = new MergedSection(name, flags, shdr.sh_type, entsize);
   ctx.merged_sections.emplace_back(osec);
   return osec;
 }
@@ -1962,7 +2091,55 @@ MergedSection<E>::insert(Context<E> &ctx, std::string_view data, u64 hash,
 }
 
 template <typename E>
-void MergedSection<E>::assign_offsets(Context<E> &ctx) {
+static std::string get_cmdline_args(Context<E> &ctx) {
+  // Join cmdline_args[1..] with spaces; yields "" if there is no such element.
+  std::stringstream ss;
+  for (i64 i = 1; i < ctx.cmdline_args.size(); i++)
+    ss << (i == 1 ? "" : " ") << ctx.cmdline_args[i];
+  return ss.str();
+}
+
+// Add mold's version and, if $MOLD_DEBUG is non-empty, its command line to .comment
+template <typename E>
+static void add_comment_strings(Context<E> &ctx) {
+  auto add = [&](std::string str) {
+    std::string_view buf = save_string(ctx, str);
+    std::string_view data(buf.data(), buf.size() + 1); // +1 = NUL? TODO confirm
+    ctx.comment->insert(ctx, data, hash_string(data), 0);
+  };
+
+  // Add an identification string to .comment.
+  add(get_mold_version());
+
+  // Embed command line arguments for debugging.
+  char *env = getenv("MOLD_DEBUG");
+  if (env && env[0])
+    add("mold command line: " + get_cmdline_args(ctx));
+}
+
+template <typename E>
+void MergedSection<E>::resolve(Context<E> &ctx) {
+  tbb::parallel_for_each(members, [&](MergeableSection<E> *sec) {
+    sec->split_contents(ctx);
+  });
+
+  // Size the map so that the estimated cardinality fills about 2/3 of it.
+  map.resize(estimator.get_cardinality() * 3 / 2);
+
+  tbb::parallel_for_each(members, [&](MergeableSection<E> *sec) {
+    sec->resolve_contents(ctx);
+  });
+
+  if (this == ctx.comment)
+    add_comment_strings(ctx);
+  resolved = true; // compute_section_size() calls resolve() only while unset
+}
+
+template <typename E>
+void MergedSection<E>::compute_section_size(Context<E> &ctx) {
+  if (!resolved)
+    resolve(ctx);
+
   std::vector<i64> sizes(map.NUM_SHARDS);
   Atomic<i64> alignment = 1;
 
@@ -2014,11 +2191,11 @@ void MergedSection<E>::assign_offsets(Context<E> &ctx) {
 
 template <typename E>
 void MergedSection<E>::copy_buf(Context<E> &ctx) {
-  write_to(ctx, ctx.buf + this->shdr.sh_offset);
+  write_to(ctx, ctx.buf + this->shdr.sh_offset, nullptr);
 }
 
 template <typename E>
-void MergedSection<E>::write_to(Context<E> &ctx, u8 *buf) {
+void MergedSection<E>::write_to(Context<E> &ctx, u8 *buf, ElfRel<E> *rel) {
   i64 shard_size = map.nbuckets / map.NUM_SHARDS;
 
   tbb::parallel_for((i64)0, map.NUM_SHARDS, [&](i64 i) {
@@ -2051,14 +2228,6 @@ template <typename E>
 void EhFrameSection<E>::construct(Context<E> &ctx) {
   Timer t(ctx, "eh_frame");
 
-  // If .eh_frame is missing in all input files, we don't want to
-  // create an output .eh_frame section.
-  if (std::all_of(ctx.objs.begin(), ctx.objs.end(),
-                  [](ObjectFile<E> *file) { return file->cies.empty(); })) {
-    this->shdr.sh_size = 0;
-    return;
-  }
-
   // Remove dead FDEs and assign them offsets within their corresponding
   // CIE group.
   tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
@@ -2291,6 +2460,16 @@ void CopyrelSection<E>::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
   assert(!ctx.arg.shared);
   assert(sym->file->is_dso);
 
+  if (sym->esym().st_visibility == STV_PROTECTED)
+    Error(ctx) << *sym->file
+               << ": cannot create a copy relocation for protected symbol '"
+               << *sym << "'; recompile with -fPIC";
+
+  if (!ctx.arg.z_copyreloc)
+    Error(ctx) << "-z nocopyreloc: " << *sym->file
+               << ": cannot create a copy relocation for symbol '" << *sym
+               << "'; recompile with -fPIC";
+
   symbols.push_back(sym);
 
   SharedFile<E> &file = *(SharedFile<E> *)sym->file;
@@ -2347,8 +2526,8 @@ void VersymSection<E>::copy_buf(Context<E> &ctx) {
 //
 // .relr.dyn is relatively new feature and not supported by glibc until
 // 2.38 which was released in 2022. If we don't do anything, executables
-// built with `-z pack-relative-relocs` 't work and would crash
-// immediately on startup with an older version of glibc.
+// built with `-z pack-relative-relocs` would just crash immediately on
+// startup with an older version of glibc.
 //
 // As a workaround, we'll add a dependency to a dummy version name
 // "GLIBC_ABI_DT_RELR" if `-z pack-relative-relocs` is given so that
@@ -2526,89 +2705,21 @@ void VerdefSection<E>::copy_buf(Context<E> &ctx) {
   write_vector(ctx.buf + this->shdr.sh_offset, contents);
 }
 
-inline i64 BuildId::size() const {
-  switch (kind) {
-  case HEX:
-    return value.size();
-  case HASH:
-    return hash_size;
-  case UUID:
-    return 16;
-  default:
-    unreachable();
-  }
-}
-
 template <typename E>
 void BuildIdSection<E>::update_shdr(Context<E> &ctx) {
-  this->shdr.sh_size = HEADER_SIZE + ctx.arg.build_id.size();
+  this->shdr.sh_size = ctx.arg.build_id.size() + 16; // +16 for the header
 }
 
 template <typename E>
 void BuildIdSection<E>::copy_buf(Context<E> &ctx) {
   U32<E> *base = (U32<E> *)(ctx.buf + this->shdr.sh_offset);
   memset(base, 0, this->shdr.sh_size);
-  base[0] = 4;                          // Name size
-  base[1] = ctx.arg.build_id.size();    // Hash size
-  base[2] = NT_GNU_BUILD_ID;            // Type
-  memcpy(base + 3, "GNU", 4);           // Name string
-}
 
-// BLAKE3 is a cryptographic hash function just like SHA256.
-// We use it instead of SHA256 because it's faster.
-static void blake3_hash(u8 *buf, i64 size, u8 *out) {
-  blake3_hasher hasher;
-  blake3_hasher_init(&hasher);
-  blake3_hasher_update(&hasher, buf, size);
-  blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN);
-}
-
-template <typename E>
-void BuildIdSection<E>::write_buildid(Context<E> &ctx) {
-  Timer t(ctx, "build_id");
-  u8 *buf = ctx.buf + this->shdr.sh_offset + HEADER_SIZE;
-
-  switch (ctx.arg.build_id.kind) {
-  case BuildId::HEX:
-    write_vector(buf, ctx.arg.build_id.value);
-    return;
-  case BuildId::HASH: {
-    i64 shard_size = 4 * 1024 * 1024;
-    i64 filesize = ctx.output_file->filesize;
-    i64 num_shards = align_to(filesize, shard_size) / shard_size;
-    std::vector<u8> shards(num_shards * BLAKE3_OUT_LEN);
-
-    tbb::parallel_for((i64)0, num_shards, [&](i64 i) {
-      u8 *begin = ctx.buf + shard_size * i;
-      u8 *end = (i == num_shards - 1) ? ctx.buf + filesize : begin + shard_size;
-      blake3_hash(begin, end - begin, shards.data() + i * BLAKE3_OUT_LEN);
-
-#ifdef HAVE_MADVISE
-      // Make the kernel page out the file contents we've just written
-      // so that subsequent close(2) call will become quicker.
-      if (i > 0 && ctx.output_file->is_mmapped)
-        madvise(begin, end - begin, MADV_DONTNEED);
-#endif
-    });
-
-    u8 digest[BLAKE3_OUT_LEN];
-    blake3_hash(shards.data(), shards.size(), digest);
-
-    assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN);
-    memcpy(buf, digest, ctx.arg.build_id.size());
-    return;
-  }
-  case BuildId::UUID: {
-    get_random_bytes(buf, 16);
-
-    // Indicate that this is UUIDv4 as defined by RFC4122
-    buf[6] = (buf[6] & 0b0000'1111) | 0b0100'0000;
-    buf[8] = (buf[8] & 0b0011'1111) | 0b1000'0000;
-    return;
-  }
-  default:
-    unreachable();
-  }
+  base[0] = 4;                       // Name size
+  base[1] = ctx.arg.build_id.size(); // Hash size
+  base[2] = NT_GNU_BUILD_ID;         // Type
+  memcpy(base + 3, "GNU", 4);        // Name string
+  write_vector(base + 4, contents);  // Build ID
 }
 
 template <typename E>
@@ -2692,6 +2803,8 @@ void NotePropertySection<E>::update_shdr(Context<E> &ctx) {
   if (ctx.arg.z_shstk)
     properties[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
 
+  properties[GNU_PROPERTY_X86_ISA_1_NEEDED] |= ctx.arg.z_x86_64_isa_level;
+
   std::erase_if(properties, [](std::pair<u32, u32> kv) {
     return kv.second == 0;
   });
@@ -2730,7 +2843,7 @@ CompressedSection<E>::CompressedSection(Context<E> &ctx, Chunk<E> &chunk) {
   this->uncompressed_data.resize(chunk.shdr.sh_size);
   u8 *buf = this->uncompressed_data.data();
 
-  chunk.write_to(ctx, buf);
+  chunk.write_to(ctx, buf, nullptr);
 
   switch (ctx.arg.compress_debug_sections) {
   case COMPRESS_ZLIB:
@@ -2846,10 +2959,6 @@ void RelocSection<E>::copy_buf(Context<E> &ctx) {
     i64 addend;
     std::tie(symidx, addend) = get_symidx_addend(isec, rel);
 
-    if constexpr (is_alpha<E>)
-      if (rel.r_type == R_ALPHA_GPDISP || rel.r_type == R_ALPHA_LITUSE)
-        addend = rel.r_addend;
-
     i64 r_offset = isec.output_section->shdr.sh_addr + isec.offset + rel.r_offset;
     out = ElfRel<E>(r_offset, rel.r_type, symidx, addend);
 
@@ -2888,6 +2997,20 @@ void ComdatGroupSection<E>::copy_buf(Context<E> &ctx) {
     *buf++ = chunk->shndx;
 }
 
+template <typename E>
+void GnuDebuglinkSection<E>::update_shdr(Context<E> &ctx) {
+  filename = std::filesystem::path(ctx.arg.separate_debug_file).filename().string();
+  this->shdr.sh_size = align_to(filename.size() + 1, 4) + 4; // padded name + crc32
+}
+
+template <typename E>
+void GnuDebuglinkSection<E>::copy_buf(Context<E> &ctx) {
+  u8 *buf = ctx.buf + this->shdr.sh_offset;
+  memset(buf, 0, this->shdr.sh_size); // zero-fill, incl. bytes after the name
+  write_string(buf, filename);
+  *(U32<E> *)(buf + this->shdr.sh_size - 4) = crc32; // last 4 bytes of section
+}
+
 using E = MOLD_TARGET;
 
 template class Chunk<E>;
@@ -2926,10 +3049,11 @@ template class GdbIndexSection<E>;
 template class CompressedSection<E>;
 template class RelocSection<E>;
 template class ComdatGroupSection<E>;
+template class GnuDebuglinkSection<E>;
 
-template OutputSection<E> *find_section(Context<E> &, u32);
-template OutputSection<E> *find_section(Context<E> &, std::string_view);
+template Chunk<E> *find_chunk(Context<E> &, u32);
+template Chunk<E> *find_chunk(Context<E> &, std::string_view);
 template i64 to_phdr_flags(Context<E> &ctx, Chunk<E> *chunk);
 template ElfSym<E> to_output_esym(Context<E> &, Symbol<E> &, u32, U32<E> *);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/src/output-file-unix.cc b/src/output-file-unix.cc
new file mode 100644
index 00000000..0a6f9eb2
--- /dev/null
+++ b/src/output-file-unix.cc
@@ -0,0 +1,200 @@
+#include "mold.h"
+
+#include <fcntl.h>
+#include <filesystem>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace mold {
+
+static u32 get_umask() {
+  u32 orig_umask = umask(0); // umask() returns the previous mask, now set to 0
+  umask(orig_umask);         // put the previous mask back
+  return orig_umask;
+}
+
+template <typename E>
+static int
+open_or_create_file(Context<E> &ctx, std::string path, std::string tmpfile,
+                    int perm) {
+  // Reuse an existing file if it exists and is writable, because on Linux
+  // writing to an existing file is much faster than creating a fresh
+  // file and writing to it.
+  if (ctx.overwrite_output_file && rename(path.c_str(), tmpfile.c_str()) == 0) {
+    i64 fd = ::open(tmpfile.c_str(), O_RDWR | O_CREAT, perm);
+    if (fd != -1)
+      return fd;
+    unlink(tmpfile.c_str()); // renamed file can't be opened; create a new one
+  }
+
+  i64 fd = ::open(tmpfile.c_str(), O_RDWR | O_CREAT, perm);
+  if (fd == -1)
+    Fatal(ctx) << "cannot open " << tmpfile << ": " << errno_string();
+  return fd;
+}
+
+template <typename E>
+class MemoryMappedOutputFile : public OutputFile<E> {
+public:
+  MemoryMappedOutputFile(Context<E> &ctx, std::string path, i64 filesize, int perm)
+    : OutputFile<E>(path, filesize, true) {
+    std::filesystem::path dir = filepath(path).parent_path();
+    std::string filename = filepath(path).filename().string();
+    std::string tmpfile = dir / ("." + filename + "." + std::to_string(getpid()));
+
+    this->fd = open_or_create_file(ctx, path, tmpfile, perm);
+
+    if (fchmod(this->fd, perm & ~get_umask()) == -1)
+      Fatal(ctx) << "fchmod failed: " << errno_string();
+
+    if (ftruncate(this->fd, filesize) == -1)
+      Fatal(ctx) << "ftruncate failed: " << errno_string();
+
+    output_tmpfile = (char *)save_string(ctx, tmpfile).data();
+
+#ifdef __linux__
+    fallocate(this->fd, 0, 0, filesize); // return value is not checked
+#endif
+
+    this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE,
+                           MAP_SHARED, this->fd, 0);
+    if (this->buf == MAP_FAILED)
+      Fatal(ctx) << path << ": mmap failed: " << errno_string();
+
+    mold::output_buffer_start = this->buf;
+    mold::output_buffer_end = this->buf + filesize;
+  }
+
+  ~MemoryMappedOutputFile() {
+    if (fd2 != -1)
+      ::close(fd2);
+  }
+
+  void close(Context<E> &ctx) override {
+    Timer t(ctx, "close_file");
+
+    if (!this->is_unmapped)
+      munmap(this->buf, this->filesize);
+
+    if (this->buf2.empty()) {
+      ::close(this->fd);
+    } else {
+      FILE *out = fdopen(this->fd, "w");
+      fseek(out, 0, SEEK_END);
+      fwrite(&this->buf2[0], this->buf2.size(), 1, out);
+      fclose(out); // fclose() also closes this->fd
+    }
+
+    // If an output file already exists, open it and then unlink it.
+    // This is the fastest way to unlink a file, as it does not make the
+    // system immediately release the disk blocks occupied by the file.
+    fd2 = ::open(this->path.c_str(), O_RDONLY);
+    if (fd2 != -1)
+      unlink(this->path.c_str());
+
+    if (rename(output_tmpfile, this->path.c_str()) == -1)
+      Fatal(ctx) << this->path << ": rename failed: " << errno_string();
+    output_tmpfile = nullptr;
+  }
+
+private:
+  int fd2 = -1; // previous output file, kept open until the destructor runs
+};
+
+template <typename E>
+std::unique_ptr<OutputFile<E>>
+OutputFile<E>::open(Context<E> &ctx, std::string path, i64 filesize, int perm) {
+  Timer t(ctx, "open_file");
+
+  if (path.starts_with('/') && !ctx.arg.chroot.empty())
+    path = ctx.arg.chroot + "/" + path_clean(path);
+
+  bool is_special = false;
+  if (path == "-") {
+    is_special = true; // "-" gets a MallocOutputFile below
+  } else {
+    struct stat st;
+    if (stat(path.c_str(), &st) == 0 && (st.st_mode & S_IFMT) != S_IFREG)
+      is_special = true; // path exists but is not a regular file
+  }
+
+  OutputFile<E> *file;
+  if (is_special)
+    file = new MallocOutputFile(ctx, path, filesize, perm);
+  else
+    file = new MemoryMappedOutputFile(ctx, path, filesize, perm);
+
+#ifdef MADV_HUGEPAGE
+  // Enable transparent huge page for an output memory-mapped file.
+  // On Linux, it has an effect only on tmpfs mounted with `huge=advise`,
+  // but it can make the linker ~10% faster. You can try it by creating
+  // a tmpfs with the following commands
+  //
+  //  $ mkdir tmp
+  //  $ sudo mount -t tmpfs -o size=2G,huge=advise none tmp
+  //
+  // and then specifying a path under the directory as an output file.
+  madvise(file->buf, filesize, MADV_HUGEPAGE);
+#endif
+
+  if (ctx.arg.filler != -1) // -1 presumably means "no filler" (TODO confirm)
+    memset(file->buf, ctx.arg.filler, filesize);
+  return std::unique_ptr<OutputFile>(file);
+}
+
+// LockingOutputFile is similar to MemoryMappedOutputFile, but it doesn't
+// rename output files and instead acquires a file lock using flock().
+template <typename E>
+LockingOutputFile<E>::LockingOutputFile(Context<E> &ctx, std::string path,
+                                        int perm)
+  : OutputFile<E>(path, 0, true) {
+  this->fd = ::open(path.c_str(), O_RDWR | O_CREAT, perm);
+  if (this->fd == -1)
+    Fatal(ctx) << "cannot open " << path << ": " << errno_string();
+  flock(this->fd, LOCK_EX); // return value is not checked
+
+  // We may be overwriting an existing debug info file. We want to
+  // make the file unusable so that gdb won't use it by accident until
+  // it's ready, so clobber its first 256 bytes with zeros.
+  u8 buf[256] = {};
+  (void)!!write(this->fd, buf, sizeof(buf));
+}
+
+template <typename E>
+void LockingOutputFile<E>::resize(Context<E> &ctx, i64 filesize) {
+  if (ftruncate(this->fd, filesize) == -1)
+    Fatal(ctx) << "ftruncate failed: " << errno_string();
+
+  this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE,
+                         MAP_SHARED, this->fd, 0);
+  if (this->buf == MAP_FAILED)
+    Fatal(ctx) << this->path << ": mmap failed: " << errno_string();
+
+  this->filesize = filesize; // close() unmaps this many bytes
+  mold::output_buffer_start = this->buf;
+  mold::output_buffer_end = this->buf + filesize;
+}
+
+// Unmap the buffer, append buf2 if any, and close the file exactly once.
+template <typename E>
+void LockingOutputFile<E>::close(Context<E> &ctx) {
+  if (!this->is_unmapped)
+    munmap(this->buf, this->filesize);
+  if (this->buf2.empty()) {
+    ::close(this->fd);
+  } else {
+    FILE *out = fdopen(this->fd, "w"); // `out` now owns this->fd
+    fseek(out, 0, SEEK_END);
+    fwrite(&this->buf2[0], this->buf2.size(), 1, out);
+    fclose(out); // also closes this->fd, so no ::close() on this path
+  }
+}
+
+using E = MOLD_TARGET;
+
+template class OutputFile<E>;
+template class LockingOutputFile<E>;
+
+} // namespace mold
diff --git a/src/output-file-win32.cc b/src/output-file-win32.cc
new file mode 100644
index 00000000..68bd26c8
--- /dev/null
+++ b/src/output-file-win32.cc
@@ -0,0 +1,118 @@
+#include "mold.h"
+
+#include <fcntl.h>
+#include <filesystem>
+#include <windows.h>
+
+namespace mold {
+
+// Output file backed by a Win32 file mapping; close() appends buf2 if any.
+template <typename E>
+class MemoryMappedOutputFile : public OutputFile<E> {
+public:
+  MemoryMappedOutputFile(Context<E> &ctx, std::string path, i64 filesize, int perm)
+      : OutputFile<E>(path, filesize, true) {
+    // TODO: use intermediate temporary file for output.
+    DWORD attrs = (perm & 0200) ? FILE_ATTRIBUTE_NORMAL : FILE_ATTRIBUTE_READONLY;
+
+    handle = CreateFileA(path.c_str(), GENERIC_READ | GENERIC_WRITE,
+                         FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                         nullptr, CREATE_ALWAYS, attrs, nullptr);
+    if (handle == INVALID_HANDLE_VALUE)
+      Fatal(ctx) << "cannot open " << path << ": " << GetLastError();
+
+    // Pass the size as high/low 32-bit halves so outputs >= 4 GiB map fully.
+    HANDLE map = CreateFileMapping(handle, nullptr, PAGE_READWRITE,
+                                   DWORD((u64)filesize >> 32),
+                                   DWORD(filesize), nullptr);
+    if (!map)
+      Fatal(ctx) << path << ": CreateFileMapping failed: " << GetLastError();
+
+    this->buf = (u8 *)MapViewOfFile(map, FILE_MAP_WRITE, 0, 0, filesize);
+    if (!this->buf)
+      Fatal(ctx) << path << ": MapViewOfFile failed: " << GetLastError();
+    CloseHandle(map);
+
+    mold::output_buffer_start = this->buf;
+    mold::output_buffer_end = this->buf + filesize;
+  }
+
+  ~MemoryMappedOutputFile() {
+    if (handle != INVALID_HANDLE_VALUE)
+      CloseHandle(handle);
+  }
+
+  void close(Context<E> &ctx) override {
+    Timer t(ctx, "close_file");
+    UnmapViewOfFile(this->buf);
+
+    if (!this->buf2.empty()) {
+      // Seek with the 64-bit API; SetFilePointer() with a null high part
+      // fails when the end of file does not fit in 32 bits.
+      if (!SetFilePointerEx(handle, LARGE_INTEGER{}, nullptr, FILE_END))
+        Fatal(ctx) << this->path << ": SetFilePointerEx failed: " << GetLastError();
+      DWORD written;
+      if (!WriteFile(handle, this->buf2.data(), this->buf2.size(), &written,
+                     nullptr))
+        Fatal(ctx) << this->path << ": WriteFile failed: " << GetLastError();
+    }
+    CloseHandle(handle);
+    handle = INVALID_HANDLE_VALUE;
+  }
+
+private:
+  HANDLE handle;
+};
+
+// Creates the output file object for `path`: a MallocOutputFile for "-" or
+// an existing non-disk file, a MemoryMappedOutputFile otherwise.
+template <typename E>
+std::unique_ptr<OutputFile<E>>
+OutputFile<E>::open(Context<E> &ctx, std::string path, i64 filesize, int perm) {
+  Timer t(ctx, "open_file");
+  if (!ctx.arg.chroot.empty() && path.starts_with('/'))
+    path = ctx.arg.chroot + "/" + path_clean(path);
+
+  // Probe an existing file to see whether it is something other than a
+  // disk file. If the path cannot be opened, is_special stays false.
+  bool is_special = (path == "-");
+  if (!is_special) {
+    HANDLE h = CreateFileA(path.c_str(), GENERIC_READ,
+                           FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                           nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
+    if (h != INVALID_HANDLE_VALUE) {
+      is_special = (GetFileType(h) != FILE_TYPE_DISK);
+      CloseHandle(h);
+    }
+  }
+
+  std::unique_ptr<OutputFile<E>> file;
+  if (is_special)
+    file.reset(new MallocOutputFile(ctx, path, filesize, perm));
+  else
+    file.reset(new MemoryMappedOutputFile(ctx, path, filesize, perm));
+
+  if (ctx.arg.filler != -1)
+    memset(file->buf, ctx.arg.filler, filesize);
+  return file;
+}
+
+template <typename E>
+LockingOutputFile<E>::LockingOutputFile(Context<E> &ctx, std::string path,
+                                        int perm)
+  : OutputFile<E>(path, 0, true) { // stub: only reports a fatal error
+  Fatal(ctx) << "LockingOutputFile is not supported on Windows";
+}
+
+template <typename E>
+void LockingOutputFile<E>::resize(Context<E> &ctx, i64 filesize) {} // no-op
+
+template <typename E>
+void LockingOutputFile<E>::close(Context<E> &ctx) {} // no-op
+
+using E = MOLD_TARGET;
+
+template class OutputFile<E>;
+template class LockingOutputFile<E>;
+
+} // namespace mold
diff --git a/elf/passes.cc b/src/passes.cc
similarity index 83%
rename from elf/passes.cc
rename to src/passes.cc
index 49fa569f..807bb2bc 100644
--- a/elf/passes.cc
+++ b/src/passes.cc
@@ -1,4 +1,5 @@
 #include "mold.h"
+#include "blake3.h"
 
 #include <fstream>
 #include <functional>
@@ -11,49 +12,47 @@
 #include <tbb/partitioner.h>
 #include <unordered_set>
 
-namespace mold::elf {
+namespace mold {
 
-// Since elf_main is a template, we can't run it without a type parameter.
-// We speculatively run elf_main with X86_64, and if the speculation was
+// Since mold_main is a template, we can't run it without a type parameter.
+// We speculatively run mold_main with X86_64, and if the speculation was
 // wrong, re-run it with an actual machine type.
 template <typename E>
 int redo_main(Context<E> &ctx, int argc, char **argv) {
   std::string_view target = ctx.arg.emulation;
 
   if (target == I386::target_name)
-    return elf_main<I386>(argc, argv);
+    return mold_main<I386>(argc, argv);
   if (target == ARM64::target_name)
-    return elf_main<ARM64>(argc, argv);
+    return mold_main<ARM64>(argc, argv);
   if (target == ARM32::target_name)
-    return elf_main<ARM32>(argc, argv);
+    return mold_main<ARM32>(argc, argv);
   if (target == RV64LE::target_name)
-    return elf_main<RV64LE>(argc, argv);
+    return mold_main<RV64LE>(argc, argv);
   if (target == RV64BE::target_name)
-    return elf_main<RV64BE>(argc, argv);
+    return mold_main<RV64BE>(argc, argv);
   if (target == RV32LE::target_name)
-    return elf_main<RV32LE>(argc, argv);
+    return mold_main<RV32LE>(argc, argv);
   if (target == RV32BE::target_name)
-    return elf_main<RV32BE>(argc, argv);
+    return mold_main<RV32BE>(argc, argv);
   if (target == PPC32::target_name)
-    return elf_main<PPC32>(argc, argv);
+    return mold_main<PPC32>(argc, argv);
   if (target == PPC64V1::target_name)
-    return elf_main<PPC64V1>(argc, argv);
+    return mold_main<PPC64V1>(argc, argv);
   if (target == PPC64V2::target_name)
-    return elf_main<PPC64V2>(argc, argv);
+    return mold_main<PPC64V2>(argc, argv);
   if (target == S390X::target_name)
-    return elf_main<S390X>(argc, argv);
+    return mold_main<S390X>(argc, argv);
   if (target == SPARC64::target_name)
-    return elf_main<SPARC64>(argc, argv);
+    return mold_main<SPARC64>(argc, argv);
   if (target == M68K::target_name)
-    return elf_main<M68K>(argc, argv);
+    return mold_main<M68K>(argc, argv);
   if (target == SH4::target_name)
-    return elf_main<SH4>(argc, argv);
-  if (target == ALPHA::target_name)
-    return elf_main<ALPHA>(argc, argv);
+    return mold_main<SH4>(argc, argv);
   if (target == LOONGARCH32::target_name)
-    return elf_main<LOONGARCH32>(argc, argv);
+    return mold_main<LOONGARCH32>(argc, argv);
   if (target == LOONGARCH64::target_name)
-    return elf_main<LOONGARCH64>(argc, argv);
+    return mold_main<LOONGARCH64>(argc, argv);
   unreachable();
 }
 
@@ -155,6 +154,8 @@ void create_synthetic_sections(Context<E> &ctx) {
     ctx.verdef = push(new VerdefSection<E>);
   if (ctx.arg.emit_relocs)
     ctx.eh_frame_reloc = push(new EhFrameRelocSection<E>);
+  if (!ctx.arg.separate_debug_file.empty())
+    ctx.gnu_debuglink = push(new GnuDebuglinkSection<E>);
 
   if (ctx.arg.shared || !ctx.dsos.empty() || ctx.arg.pie) {
     ctx.dynamic = push(new DynamicSection<E>(ctx));
@@ -170,6 +171,13 @@ void create_synthetic_sections(Context<E> &ctx) {
   ctx.note_package = push(new NotePackageSection<E>);
   ctx.note_property = push(new NotePropertySection<E>);
 
+  if (!ctx.arg.oformat_binary) {
+    ElfShdr<E> shdr = {};
+    shdr.sh_type = SHT_PROGBITS;
+    shdr.sh_flags = SHF_MERGE | SHF_STRINGS;
+    ctx.comment = MergedSection<E>::get_instance(ctx, ".comment", shdr);
+  }
+
   if constexpr (is_riscv<E>)
     ctx.extra.riscv_attributes = push(new RiscvAttributesSection<E>);
 
@@ -178,15 +186,6 @@ void create_synthetic_sections(Context<E> &ctx) {
 
   if constexpr (is_ppc64v2<E>)
     ctx.extra.save_restore = push(new PPC64SaveRestoreSection);
-
-  if constexpr (is_sparc<E>) {
-    if (ctx.arg.is_static)
-      ctx.extra.tls_get_addr_sec = push(new SparcTlsGetAddrSection);
-    ctx.extra.tls_get_addr_sym = get_symbol(ctx, "__tls_get_addr");
-  }
-
-  if constexpr (is_alpha<E>)
-    ctx.extra.got = push(new AlphaGotSection);
 }
 
 template <typename E>
@@ -252,173 +251,146 @@ static void clear_symbols(Context<E> &ctx) {
 }
 
 template <typename E>
-void do_resolve_symbols(Context<E> &ctx) {
+void resolve_symbols(Context<E> &ctx) {
+  Timer t(ctx, "resolve_symbols");
+
   std::vector<InputFile<E> *> files;
   append(files, ctx.objs);
   append(files, ctx.dsos);
 
-  // Due to legacy reasons, archive members will only get included in the final
-  // binary if they satisfy one of the undefined symbols in a non-archive object
-  // file. This is called archive extraction. In finalize_archive_extraction,
-  // this is processed as follows:
-  //
-  // 1. Do preliminary symbol resolution assuming all archive members
-  //    are included. This matches the undefined symbols with ones to be
-  //    extracted from archives.
-  //
-  // 2. Do a mark & sweep pass to eliminate unneeded archive members.
-  //
-  // Note that the symbol resolution inside finalize_archive_extraction uses a
-  // different rule. In order to prevent extracting archive members that can be
-  // satisfied by either non-archive object files or DSOs, the archive members
-  // are given a lower priority. This is not correct for the general case, where
-  // *extracted* object files have precedence over DSOs and even non-archive
-  // files that are passed earlier in the command line. Hence, the symbol
-  // resolution is thrown away once we determine which archive members to
-  // extract, and redone later with the formal rule.
-  {
-    Timer t(ctx, "extract_archive_members");
-
-    // Register symbols
+  for (;;) {
+    // Call resolve_symbols() to find the most appropriate file for each
+    // symbol. And then mark reachable objects to decide which files to
+    // include into an output.
     tbb::parallel_for_each(files, [&](InputFile<E> *file) {
       file->resolve_symbols(ctx);
     });
 
-    // Mark reachable objects to decide which files to include into an output.
-    // This also merges symbol visibility.
     mark_live_objects(ctx);
 
-    // Cleanup. The rule used for archive extraction isn't accurate for the
-    // general case of symbol extraction, so reset the resolution to be redone
-    // later.
+    // Now that we know the exact set of input files that are to be
+    // included in the output file, we want to redo symbol resolution.
+    // This is because symbols defined by object files in archive files
+    // may have risen as a result of mark_live_objects().
+    //
+    // To redo symbol resolution, we want to clear the state first.
     clear_symbols(ctx);
 
-    // Now that the symbol references are gone, remove the eliminated files from
-    // the file list.
-    std::erase_if(files, [](InputFile<E> *file) { return !file->is_alive; });
-    std::erase_if(ctx.objs, [](InputFile<E> *file) { return !file->is_alive; });
-    std::erase_if(ctx.dsos, [](InputFile<E> *file) { return !file->is_alive; });
-  }
-
-  // COMDAT elimination needs to happen exactly here.
-  //
-  // It needs to be after archive extraction, otherwise we might assign COMDAT
-  // leader to an archive member that is not supposed to be extracted.
-  //
-  // It needs to happen before symbol resolution, otherwise we could eliminate
-  // a symbol that is already resolved to and cause dangling references.
-  {
-    Timer t(ctx, "eliminate_comdats");
-
+    // COMDAT elimination needs to happen exactly here.
+    //
+    // It needs to be after archive extraction, otherwise we might
+    // assign COMDAT leader to an archive member that is not supposed to
+    // be extracted.
+    //
+    // It needs to happen before the final symbol resolution, otherwise
+    // we could eliminate a symbol that is already resolved to and cause
+    // dangling references.
     tbb::parallel_for_each(ctx.objs, [](ObjectFile<E> *file) {
-      for (ComdatGroupRef<E> &ref : file->comdat_groups)
-        update_minimum(ref.group->owner, file->priority);
+      if (file->is_alive)
+        for (ComdatGroupRef<E> &ref : file->comdat_groups)
+          update_minimum(ref.group->owner, file->priority);
     });
 
     tbb::parallel_for_each(ctx.objs, [](ObjectFile<E> *file) {
-      for (ComdatGroupRef<E> &ref : file->comdat_groups)
-        if (ref.group->owner != file->priority)
-          for (u32 i : ref.members)
-            if (file->sections[i])
-              file->sections[i]->kill();
+      if (file->is_alive)
+        for (ComdatGroupRef<E> &ref : file->comdat_groups)
+          if (ref.group->owner != file->priority)
+            for (u32 i : ref.members)
+              if (InputSection<E> *isec = file->sections[i].get())
+                isec->is_alive = false;
     });
-  }
 
-  // Since we have turned on object files live bits, their symbols
-  // may now have higher priority than before. So run the symbol
-  // resolution pass again to get the final resolution result.
-  tbb::parallel_for_each(files, [&](InputFile<E> *file) {
-    file->resolve_symbols(ctx);
-  });
-}
-
-template <typename E>
-void resolve_symbols(Context<E> &ctx) {
-  Timer t(ctx, "resolve_symbols");
+    // Redo symbol resolution
+    tbb::parallel_for_each(files, [&](InputFile<E> *file) {
+      if (file->is_alive)
+        file->resolve_symbols(ctx);
+    });
 
-  std::vector<ObjectFile<E> *> objs = ctx.objs;
-  std::vector<SharedFile<E> *> dsos = ctx.dsos;
+    // Symbols with hidden visibility need to be resolved within the
+    // output file. If a hidden symbol was resolved to a DSO, we'll redo
+    // symbol resolution from scratch with the flag to skip that symbol
+    // next time. This should be rare.
+    std::atomic_bool flag = false;
 
-  do_resolve_symbols(ctx);
+    tbb::parallel_for_each(ctx.dsos, [&](SharedFile<E> *file) {
+      if (file->is_alive) {
+        for (Symbol<E> *sym : file->symbols) {
+          if (sym->file == file && sym->visibility == STV_HIDDEN) {
+            sym->skip_dso = true;
+            flag = true;
+          }
+        }
+      }
+    });
 
-  bool has_lto_obj = false;
-  for (ObjectFile<E> *file : objs)
-    if (file->is_alive && (file->is_lto_obj || file->is_gcc_offload_obj))
-      has_lto_obj = true;
+    if (!flag)
+      return;
 
-  if (has_lto_obj) {
-    // Do link-time optimization. We pass all IR object files to the
-    // compiler backend to compile them into a few ELF object files.
-    //
-    // The compiler backend needs to know how symbols are resolved,
-    // so compute symbol visibility, import/export bits, etc early.
-    mark_live_objects(ctx);
-    apply_version_script(ctx);
-    parse_symbol_version(ctx);
-    compute_import_export(ctx);
+    clear_symbols(ctx);
+    resolve_symbols(ctx);
+  }
+}
 
-    // Do LTO. It compiles IR object files into a few big ELF files.
-    std::vector<ObjectFile<E> *> lto_objs = do_lto(ctx);
+// Do link-time optimization. We pass all IR object files to the compiler
+// backend to compile them into a few ELF object files.
+template <typename E>
+void do_lto(Context<E> &ctx) {
+  Timer t(ctx, "do_lto");
 
-    // do_resolve_symbols() have removed unreferenced files. Restore the
-    // original files here because some of them may have to be resurrected
-    // because they are referenced by the ELF files returned from do_lto().
-    ctx.objs = objs;
-    ctx.dsos = dsos;
+  // The compiler backend needs to know how symbols are resolved, so
+  // compute symbol visibility, import/export bits, etc early.
+  mark_live_objects(ctx);
+  apply_version_script(ctx);
+  parse_symbol_version(ctx);
+  compute_import_export(ctx);
 
-    append(ctx.objs, lto_objs);
+  // Invoke the LTO plugin. This step compiles IR object files into a few
+  // big ELF files.
+  std::vector<ObjectFile<E> *> lto_objs = run_lto_plugin(ctx);
+  append(ctx.objs, lto_objs);
 
-    // Redo name resolution from scratch.
-    clear_symbols(ctx);
+  // Redo name resolution.
+  clear_symbols(ctx);
 
-    // Remove IR object files.
-    for (ObjectFile<E> *file : ctx.objs)
-      if (file->is_lto_obj)
-        file->is_alive = false;
+  // Remove IR object files.
+  for (ObjectFile<E> *file : ctx.objs)
+    if (file->is_lto_obj)
+      file->is_alive = false;
 
-    std::erase_if(ctx.objs, [](ObjectFile<E> *file) { return file->is_lto_obj; });
+  std::erase_if(ctx.objs, [](ObjectFile<E> *file) { return file->is_lto_obj; });
 
-    do_resolve_symbols(ctx);
-  }
+  resolve_symbols(ctx);
 }
 
-// .eh_frame sections are parsed and regenerated by the linker for the purpose
-// of deduplication and garbage collection. As such, the input sections should
-// not be copied over.
-//
-// However, in very rare cases (e.g. GCC CRT compiled with LTO) we might need
-// to resolve cross-object .eh_frame section references (they only point to
-// begin or end and don't depend on the actual section contents).
-// Therefore, the sections are "killed" after symbol resolution as a separate
-// pass.
 template <typename E>
-void kill_eh_frame_sections(Context<E> &ctx) {
-  Timer t(ctx, "kill_eh_frame_sections");
+void parse_eh_frame_sections(Context<E> &ctx) {
+  Timer t(ctx, "parse_eh_frame_sections");
 
-  for (ObjectFile<E> *file : ctx.objs)
-    for (InputSection<E> *sec : file->eh_frame_sections)
-      sec->is_alive = false;
+  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
+    file->parse_ehframe(ctx);
+
+    for (InputSection<E> *isec : file->eh_frame_sections)
+      isec->is_alive = false;
+  });
 }
 
 template <typename E>
-void split_section_pieces(Context<E> &ctx) {
-  Timer t(ctx, "split_section_pieces");
+void create_merged_sections(Context<E> &ctx) {
+  Timer t(ctx, "create_merged_sections");
 
+  // Convert InputSections to MergeableSections.
   tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
-    file->initialize_mergeable_sections(ctx);
+    file->convert_mergeable_sections(ctx);
   });
-}
-
-template <typename E>
-void resolve_section_pieces(Context<E> &ctx) {
-  Timer t(ctx, "resolve_section_pieces");
 
-  // We aim 2/3 occupation ratio
-  for (std::unique_ptr<MergedSection<E>> &sec : ctx.merged_sections)
-    sec->map.resize(sec->estimator.get_cardinality() * 3 / 2);
+  tbb::parallel_for_each(ctx.merged_sections,
+                         [&](std::unique_ptr<MergedSection<E>> &sec) {
+    if (sec->shdr.sh_flags & SHF_ALLOC)
+      sec->resolve(ctx);
+  });
 
   tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
-    file->resolve_section_pieces(ctx);
+    file->reattach_section_pieces(ctx);
   });
 }
 
@@ -431,61 +403,6 @@ void convert_common_symbols(Context<E> &ctx) {
   });
 }
 
-template <typename E>
-static std::string get_cmdline_args(Context<E> &ctx) {
-  std::stringstream ss;
-  ss << ctx.cmdline_args[1];
-  for (i64 i = 2; i < ctx.cmdline_args.size(); i++)
-    ss << " " << ctx.cmdline_args[i];
-  return ss.str();
-}
-
-template <typename E>
-void add_comment_string(Context<E> &ctx, std::string str) {
-  MergedSection<E> *sec =
-    MergedSection<E>::get_instance(ctx, ".comment", SHT_PROGBITS,
-                                   SHF_MERGE | SHF_STRINGS, 1, 1);
-
-  if (sec->map.nbuckets == 0)
-    sec->map.resize(4096);
-
-  std::string_view buf = save_string(ctx, str);
-  std::string_view data(buf.data(), buf.size() + 1);
-  sec->insert(ctx, data, hash_string(data), 0);
-}
-
-template <typename E>
-void compute_merged_section_sizes(Context<E> &ctx) {
-  Timer t(ctx, "compute_merged_section_sizes");
-
-  // Add an identification string to .comment.
-  if (!ctx.arg.oformat_binary)
-    add_comment_string(ctx, get_mold_version());
-
-  // Embed command line arguments for debugging.
-  if (char *env = getenv("MOLD_DEBUG"); env && env[0])
-    add_comment_string(ctx, "mold command line: " + get_cmdline_args(ctx));
-
-  tbb::parallel_for_each(ctx.merged_sections,
-                         [&](std::unique_ptr<MergedSection<E>> &sec) {
-    sec->assign_offsets(ctx);
-  });
-}
-
-template <typename T>
-static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
-  std::span<T> span(input);
-  std::vector<std::span<T>> vec;
-
-  while (span.size() >= unit) {
-    vec.push_back(span.subspan(0, unit));
-    span = span.subspan(unit);
-  }
-  if (!span.empty())
-    vec.push_back(span);
-  return vec;
-}
-
 template <typename E>
 static bool has_ctors_and_init_array(Context<E> &ctx) {
   bool x = false;
@@ -550,13 +467,6 @@ get_output_name(Context<E> &ctx, std::string_view name, u64 flags) {
       return ".ARM.extab";
   }
 
-  if constexpr (is_alpha<E>) {
-    if (name.starts_with(".sdata."))
-      return ".sdata";
-    if (name.starts_with(".sbss."))
-      return ".sbss";
-  }
-
   if (ctx.arg.z_keep_text_section_prefix) {
     static std::string_view prefixes[] = {
       ".text.hot.", ".text.unknown.", ".text.unlikely.", ".text.startup.",
@@ -574,6 +484,7 @@ get_output_name(Context<E> &ctx, std::string_view name, u64 flags) {
     ".text.", ".data.rel.ro.", ".data.", ".rodata.", ".bss.rel.ro.", ".bss.",
     ".init_array.", ".fini_array.", ".tbss.", ".tdata.", ".gcc_except_table.",
     ".ctors.", ".dtors.", ".gnu.warning.", ".openbsd.randomdata.",
+    ".sdata.", ".sbss.", ".srodata.",
   };
 
   for (std::string_view prefix : prefixes) {
@@ -725,8 +636,7 @@ void create_output_sections(Context<E> &ctx) {
 
   // Add output sections and mergeable sections to ctx.chunks
   for (std::unique_ptr<MergedSection<E>> &osec : ctx.merged_sections)
-    if (osec->shdr.sh_size)
-      chunks.push_back(osec.get());
+    chunks.push_back(osec.get());
 
   // Sections are added to the section lists in an arbitrary order
   // because they are created in parallel. Sort them to to make the
@@ -1027,7 +937,7 @@ R"(# This is an output of the mold linker's --print-dependencies option.
       std::unordered_set<void *> visited;
 
       for (const ElfRel<E> &r : isec->get_rels(ctx)) {
-        if (r.r_type == R_NONE)
+        if (r.r_type == R_NONE || file->elf_syms.size() <= r.r_sym)
           continue;
 
         ElfSym<E> &esym = file->elf_syms[r.r_sym];
@@ -1129,6 +1039,50 @@ void check_duplicate_symbols(Context<E> &ctx) {
   ctx.checkpoint();
 }
 
+// With --no-allow-shlib-undefined, unresolved symbols in shared
+// libraries are reported as link-time errors. This is useful when
+// creating a final executable, as it verifies that every symbol,
+// including those referenced by shared libraries, has a definition.
+//
+// Without the option, undefined symbols in shared libraries are
+// left for the dynamic linker, which reports them as run-time
+// errors.
+template <typename E>
+void check_shlib_undefined(Context<E> &ctx) {
+  Timer t(ctx, "check_shlib_undefined");
+
+  // SPARC dynamic symbol tables contain bogus STT_SPARC_REGISTER
+  // entries. They are not real undefined symbols, so ignore them.
+  auto is_bogus = [](const ElfSym<E> &esym) -> bool {
+    if constexpr (is_sparc<E>)
+      return esym.st_type == STT_SPARC_REGISTER;
+    return false;
+  };
+
+  // Collect the sonames of all shared libraries we know about.
+  std::unordered_set<std::string_view> known;
+  for (SharedFile<E> *dso : ctx.dsos)
+    known.insert(dso->soname);
+
+  tbb::parallel_for_each(ctx.dsos, [&](SharedFile<E> *dso) {
+    // A library we have not loaded may define the missing symbols, so
+    // skip any file whose DT_NEEDED list names an unknown library.
+    for (std::string_view needed : dso->get_dt_needed(ctx))
+      if (!known.contains(needed))
+        return;
+
+    // Report every strong undefined symbol that no file defines.
+    for (i64 i = 0; i < dso->elf_syms.size(); i++) {
+      const ElfSym<E> &esym = dso->elf_syms[i];
+      if (!esym.is_undef() || esym.is_weak() || is_bogus(esym))
+        continue;
+      if (Symbol<E> &sym = *dso->symbols[i]; !sym.file)
+        Error(ctx) << *dso << ": --no-allow-shlib-undefined: undefined symbol: "
+                   << sym;
+    }
+  });
+}
+
 template <typename E>
 void check_symbol_types(Context<E> &ctx) {
   Timer t(ctx, "check_symbol_types");
@@ -1138,14 +1092,11 @@ void check_symbol_types(Context<E> &ctx) {
   append(files, ctx.dsos);
 
   auto canonicalize = [](u32 ty) -> u32 {
-    switch (ty) {
-    case STT_GNU_IFUNC:
+    if (ty == STT_GNU_IFUNC)
       return STT_FUNC;
-    case STT_COMMON:
+    if (ty == STT_COMMON)
       return STT_OBJECT;
-    default:
-      return ty;
-    }
+    return ty;
   };
 
   tbb::parallel_for_each(files.begin(), files.end(), [&](InputFile<E> *file) {
@@ -1205,6 +1156,11 @@ template <typename E>
 void sort_init_fini(Context<E> &ctx) {
   Timer t(ctx, "sort_init_fini");
 
+  struct Entry {
+    InputSection<E> *sect;
+    i64 prio;
+  };
+
   for (Chunk<E> *chunk : ctx.chunks) {
     if (OutputSection<E> *osec = chunk->to_osec()) {
       if (osec->name == ".init_array" || osec->name == ".preinit_array" ||
@@ -1212,19 +1168,20 @@ void sort_init_fini(Context<E> &ctx) {
         if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_REVERSE)
           std::reverse(osec->members.begin(), osec->members.end());
 
-        std::unordered_map<InputSection<E> *, i64> map;
+        std::vector<Entry> vec;
 
         for (InputSection<E> *isec : osec->members) {
           std::string_view name = isec->name();
           if (name.starts_with(".ctors") || name.starts_with(".dtors"))
-            map.insert({isec, 65535 - get_ctor_dtor_priority(isec)});
+            vec.push_back({isec, 65535 - get_ctor_dtor_priority(isec)});
           else
-            map.insert({isec, get_init_fini_priority(isec)});
+            vec.push_back({isec, get_init_fini_priority(isec)});
         }
 
-        sort(osec->members, [&](InputSection<E> *a, InputSection<E> *b) {
-          return map[a] < map[b];
-        });
+        sort(vec, [&](const Entry &a, const Entry &b) { return a.prio < b.prio; });
+
+        for (i64 i = 0; i < vec.size(); i++)
+          osec->members[i] = vec[i].sect;
       }
     }
   }
@@ -1234,19 +1191,25 @@ template <typename E>
 void sort_ctor_dtor(Context<E> &ctx) {
   Timer t(ctx, "sort_ctor_dtor");
 
+  struct Entry {
+    InputSection<E> *sect;
+    i64 prio;
+  };
+
   for (Chunk<E> *chunk : ctx.chunks) {
     if (OutputSection<E> *osec = chunk->to_osec()) {
       if (osec->name == ".ctors" || osec->name == ".dtors") {
         if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_REVERSE)
           std::reverse(osec->members.begin(), osec->members.end());
 
-        std::unordered_map<InputSection<E> *, i64> map;
+        std::vector<Entry> vec;
         for (InputSection<E> *isec : osec->members)
-          map.insert({isec, get_ctor_dtor_priority(isec)});
+          vec.push_back({isec, get_ctor_dtor_priority(isec)});
 
-        sort(osec->members, [&](InputSection<E> *a, InputSection<E> *b) {
-          return map[a] < map[b];
-        });
+        sort(vec, [&](const Entry &a, const Entry &b) { return a.prio < b.prio; });
+
+        for (i64 i = 0; i < vec.size(); i++)
+          osec->members[i] = vec[i].sect;
       }
     }
   }
@@ -1286,10 +1249,13 @@ void fixup_ctors_in_init_array(Context<E> &ctx) {
     }
   };
 
-  if (OutputSection<E> *osec = find_section(ctx, ".init_array"))
-    fixup(*osec);
-  if (OutputSection<E> *osec = find_section(ctx, ".fini_array"))
-    fixup(*osec);
+  if (Chunk<E> *chunk = find_chunk(ctx, ".init_array"))
+    if (OutputSection<E> *osec = chunk->to_osec())
+      fixup(*osec);
+
+  if (Chunk<E> *chunk = find_chunk(ctx, ".fini_array"))
+    if (OutputSection<E> *osec = chunk->to_osec())
+      fixup(*osec);
 }
 
 template <typename T>
@@ -1358,76 +1324,22 @@ template <typename E>
 void compute_section_sizes(Context<E> &ctx) {
   Timer t(ctx, "compute_section_sizes");
 
-  struct Group {
-    i64 size = 0;
-    i64 p2align = 0;
-    i64 offset = 0;
-    std::span<InputSection<E> *> members;
-  };
-
-  tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
-    OutputSection<E> *osec = chunk->to_osec();
-    if (!osec)
-      return;
-
-    // This pattern will be processed in the next loop.
-    if constexpr (needs_thunk<E>)
-      if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable)
-        return;
-
-    // Since one output section may contain millions of input sections,
-    // we first split input sections into groups and assign offsets to
-    // groups.
-    std::vector<Group> groups;
-    constexpr i64 group_size = 10000;
-
-    for (std::span<InputSection<E> *> span : split(osec->members, group_size))
-      groups.push_back(Group{.members = span});
+  if constexpr (needs_thunk<E>) {
+    // We cannot use parallel-for for compute_section_size() which may
+    // call create_range_extension_thunks() because that function is
+    // not thread-safe.
+    for (Chunk<E> *chunk : ctx.chunks)
+      if (chunk->shdr.sh_flags & SHF_EXECINSTR)
+        chunk->compute_section_size(ctx);
 
-    tbb::parallel_for_each(groups, [](Group &group) {
-      for (InputSection<E> *isec : group.members) {
-        group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
-        group.p2align = std::max<i64>(group.p2align, isec->p2align);
-      }
+    tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+      if (!(chunk->shdr.sh_flags & SHF_EXECINSTR))
+        chunk->compute_section_size(ctx);
     });
-
-    ElfShdr<E> &shdr = osec->shdr;
-    shdr.sh_size = 0;
-
-    for (i64 i = 0; i < groups.size(); i++) {
-      shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
-      groups[i].offset = shdr.sh_size;
-      shdr.sh_size += groups[i].size;
-      shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
-    }
-
-    // Assign offsets to input sections.
-    tbb::parallel_for_each(groups, [](Group &group) {
-      i64 offset = group.offset;
-      for (InputSection<E> *isec : group.members) {
-        offset = align_to(offset, 1 << isec->p2align);
-        isec->offset = offset;
-        offset += isec->sh_size;
-      }
+  } else {
+    tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+      chunk->compute_section_size(ctx);
     });
-  });
-
-  // On ARM32 or ARM64, we may need to create so-called "range extension
-  // thunks" to extend branch instructions reach, as they can jump only
-  // to ±16 MiB or ±128 MiB, respecitvely.
-  //
-  // In the following loop, We compute the sizes of sections while
-  // inserting thunks. This pass cannot be parallelized. That is,
-  // create_range_extension_thunks is parallelized internally, but the
-  // function itself is not thread-safe.
-  if constexpr (needs_thunk<E>) {
-    Timer t2(ctx, "create_range_extension_thunks");
-
-    if (!ctx.arg.relocatable)
-      for (Chunk<E> *chunk : ctx.chunks)
-        if (OutputSection<E> *osec = chunk->to_osec())
-          if (osec->shdr.sh_flags & SHF_EXECINSTR)
-            osec->create_range_extension_thunks(ctx);
   }
 }
 
@@ -1535,6 +1447,14 @@ void scan_relocations(Context<E> &ctx) {
     file->scan_relocations(ctx);
   });
 
+  // Word-size absolute relocations (e.g. R_X86_64_64) are handled
+  // separately because they can be promoted to dynamic relocations.
+  tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+    if (OutputSection<E> *osec = chunk->to_osec())
+      if (osec->shdr.sh_flags & SHF_ALLOC)
+        osec->scan_abs_relocations(ctx);
+  });
+
   // Exit if there was a relocation that refers an undefined symbol.
   ctx.checkpoint();
 
@@ -1608,9 +1528,6 @@ void scan_relocations(Context<E> &ctx) {
     sym->flags = 0;
   }
 
-  if constexpr (is_alpha<E>)
-    ctx.extra.got->finalize();
-
   if (ctx.has_textrel && ctx.arg.warn_textrel)
     Warn(ctx) << "creating a DT_TEXTREL in an output file";
 }
@@ -1702,13 +1619,21 @@ void copy_chunks(Context<E> &ctx) {
   // For --relocatable and --emit-relocs, we want to copy non-relocation
   // sections first. This is because REL-type relocation sections (as
   // opposed to RELA-type) stores relocation addends to target sections.
+  //
+  // We also do that for SH4 because, despite being RELA, we always need
+  // to write addends to relocated places for SH4.
+  auto is_rel = [](Chunk<E> &chunk) {
+    return chunk.shdr.sh_type == SHT_REL ||
+           (is_sh4<E> && chunk.shdr.sh_type == SHT_RELA);
+  };
+
   tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
-    if (chunk->shdr.sh_type != SHT_REL)
+    if (!is_rel(*chunk))
       copy(*chunk);
   });
 
   tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
-    if (chunk->shdr.sh_type == SHT_REL)
+    if (is_rel(*chunk))
       copy(*chunk);
   });
 
@@ -1718,62 +1643,21 @@ void copy_chunks(Context<E> &ctx) {
   // undefined errors.
   report_undef_errors(ctx);
 
-  if constexpr (is_arm32<E>)
-    fixup_arm_exidx_section(ctx);
-}
+  // Zero-clear paddings between chunks
+  auto zero = [&](Chunk<E> *chunk, i64 next_start) {
+    i64 pos = chunk->shdr.sh_offset + chunk->shdr.sh_size;
+    memset(ctx.buf + pos, 0, next_start - pos);
+  };
 
-// Rewrite the leading endbr64 instruction with a nop if a function
-// symbol's address was not taken.
-template <typename E>
-void rewrite_endbr(Context<E> &ctx) {
-  Timer t(ctx, "rewrite_endbr");
-  assert(is_x86_64<E>);
+  std::vector<Chunk<E> *> chunks = ctx.chunks;
 
-  // Compute address-taken bit for each symbol
-  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
-    for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
-      if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC)) {
-        for (const ElfRel<E> &rel : isec->get_rels(ctx)) {
-          Symbol<E> &sym = *file->symbols[rel.r_sym];
-          if (!is_func_call_rel(rel) && sym.esym().st_type == STT_FUNC) {
-            std::scoped_lock lock(sym.mu);
-            sym.address_taken = true;
-          }
-        }
-      }
-    }
+  std::erase_if(chunks, [](Chunk<E> *chunk) {
+    return chunk->shdr.sh_type == SHT_NOBITS;
   });
 
-  // Exported symbols are conservatively assumed to be address-taken.
-  if (ctx.dynsym)
-    for (Symbol<E> *sym : ctx.dynsym->symbols)
-      if (sym && sym->is_exported)
-        sym->address_taken = true;
-
-  // Some symbols are implicitly address-taken
-  ctx.arg.entry->address_taken = true;
-  ctx.arg.init->address_taken = true;
-  ctx.arg.fini->address_taken = true;
-
-  // Rewrite endbr64 with nop
-  u8 endbr64[] = {0xf3, 0x0f, 0x1e, 0xfa};
-  u8 nop[] = {0x0f, 0x1f, 0x40, 0x00};
-
-  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
-    for (Symbol<E> *sym : file->symbols) {
-      if (sym->file == file && sym->esym().st_type == STT_FUNC &&
-          !sym->address_taken) {
-        if (InputSection<E> *isec = sym->get_input_section()) {
-          if (OutputSection<E> *osec = isec->output_section) {
-            u8 *buf = ctx.buf + osec->shdr.sh_offset + isec->offset +
-                      sym->value;
-            if (memcmp(buf, endbr64, 4) == 0)
-              memcpy(buf, nop, 4);
-          }
-        }
-      }
-    }
-  });
+  for (i64 i = 1; i < chunks.size(); i++)
+    zero(chunks[i - 1], chunks[i]->shdr.sh_offset);
+  zero(chunks.back(), ctx.output_file->filesize);
 }
 
 template <typename E>
@@ -1785,16 +1669,77 @@ void construct_relr(Context<E> &ctx) {
   });
 }
 
+// The hash function for .gnu.hash.
+static u32 djb_hash(std::string_view name) {
+  u32 h = 5381;
+  for (u8 c : name)
+    h = (h << 5) + h + c;
+  return h;
+}
+
 template <typename E>
-void create_output_symtab(Context<E> &ctx) {
-  Timer t(ctx, "compute_symtab_size");
+void sort_dynsyms(Context<E> &ctx) {
+  Timer t(ctx, "sort_dynsyms");
 
-  if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file) {
-    tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
-      chunk->compute_symtab_size(ctx);
+  std::span<Symbol<E> *> syms = ctx.dynsym->symbols;
+  if (syms.empty())
+    return;
+
+  // In any symtab, local symbols must precede global symbols.
+  auto first_global = std::stable_partition(syms.begin() + 1, syms.end(),
+                                            [&](Symbol<E> *sym) {
+    return sym->is_local(ctx);
+  });
+
+  // .gnu.hash imposes more restrictions on the order of the symbols in
+  // .dynsym.
+  if (ctx.gnu_hash) {
+    auto first_exported = std::stable_partition(first_global, syms.end(),
+                                                [&](Symbol<E> *sym) {
+      return !sym->is_exported;
+    });
+
+    // Count the number of exported symbols to compute the size of .gnu.hash.
+    i64 num_exported = syms.end() - first_exported;
+    u32 num_buckets = num_exported / ctx.gnu_hash->LOAD_FACTOR + 1;
+
+    tbb::parallel_for_each(first_exported, syms.end(), [&](Symbol<E> *sym) {
+      sym->set_djb_hash(ctx, djb_hash(sym->name()));
+    });
+
+    tbb::parallel_sort(first_exported, syms.end(),
+                       [&](Symbol<E> *a, Symbol<E> *b) {
+      return std::tuple(a->get_djb_hash(ctx) % num_buckets, a->name()) <
+             std::tuple(b->get_djb_hash(ctx) % num_buckets, b->name());
     });
+
+    ctx.gnu_hash->num_buckets = num_buckets;
+    ctx.gnu_hash->num_exported = num_exported;
   }
 
+  // Compute .dynstr size
+  ctx.dynstr->dynsym_offset = ctx.dynstr->shdr.sh_size;
+
+  tbb::enumerable_thread_specific<i64> size;
+  tbb::parallel_for((i64)1, (i64)syms.size(), [&](i64 i) {
+    syms[i]->set_dynsym_idx(ctx, i);
+    size.local() += syms[i]->name().size() + 1;
+  });
+
+  ctx.dynstr->shdr.sh_size += size.combine(std::plus());
+
+  // ELF's symbol table sh_info holds the index of the first global symbol.
+  ctx.dynsym->shdr.sh_info = first_global - syms.begin();
+}
+
+template <typename E>
+void create_output_symtab(Context<E> &ctx) {
+  Timer t(ctx, "compute_symtab_size");
+
+  tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+    chunk->compute_symtab_size(ctx);
+  });
+
   tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
     file->compute_symtab_size(ctx);
   });
@@ -1951,6 +1896,9 @@ static bool should_export(Context<E> &ctx, Symbol<E> &sym) {
 
   switch (sym.ver_idx) {
   case VER_NDX_UNSPECIFIED:
+    if (ctx.arg.dynamic_list_data)
+      if (u32 ty = sym.get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC)
+        return true;
     if (ctx.arg.shared)
       return !((ObjectFile<E> *)sym.file)->exclude_libs;
     return ctx.arg.export_dynamic;
@@ -2168,26 +2116,6 @@ void compute_address_significance(Context<E> &ctx) {
   });
 }
 
-template <typename E>
-void clear_padding(Context<E> &ctx) {
-  Timer t(ctx, "clear_padding");
-
-  auto zero = [&](Chunk<E> *chunk, i64 next_start) {
-    i64 pos = chunk->shdr.sh_offset + chunk->shdr.sh_size;
-    memset(ctx.buf + pos, 0, next_start - pos);
-  };
-
-  std::vector<Chunk<E> *> chunks = ctx.chunks;
-
-  std::erase_if(chunks, [](Chunk<E> *chunk) {
-    return chunk->shdr.sh_type == SHT_NOBITS;
-  });
-
-  for (i64 i = 1; i < chunks.size(); i++)
-    zero(chunks[i - 1], chunks[i]->shdr.sh_offset);
-  zero(chunks.back(), ctx.output_file->filesize);
-}
-
 // We want to sort output chunks in the following order.
 //
 //   <ELF header>
@@ -2209,7 +2137,6 @@ void clear_padding(Context<E> &ctx) {
 //   <writable RELRO data>
 //   .got
 //   .toc
-//   .alpha_got
 //   <writable RELRO bss>
 //   .relro_padding
 //   <writable non-RELRO data>
@@ -2302,8 +2229,6 @@ void sort_output_sections_regular(Context<E> &ctx) {
       return 2;
     if (chunk->name == ".toc")
       return 3;
-    if (chunk->name == ".alpha_got")
-      return 4;
     if (chunk == ctx.relro_padding)
       return INT64_MAX;
     return 0;
@@ -2380,11 +2305,6 @@ void sort_output_sections(Context<E> &ctx) {
     sort_output_sections_by_order(ctx);
 }
 
-template <typename E>
-static bool is_tbss(Chunk<E> *chunk) {
-  return (chunk->shdr.sh_type == SHT_NOBITS) && (chunk->shdr.sh_flags & SHF_TLS);
-}
-
 // This function assigns virtual addresses to output sections. Assigning
 // addresses is a bit tricky because we want to pack sections as tightly
 // as possible while not violating the constraints imposed by the hardware
@@ -2450,6 +2370,10 @@ static void set_virtual_addresses_regular(Context<E> &ctx) {
     return chunk == first_tls_chunk ? tls_alignment : (u64)chunk->shdr.sh_addralign;
   };
 
+  auto is_tbss = [](Chunk<E> *chunk) {
+    return (chunk->shdr.sh_type == SHT_NOBITS) && (chunk->shdr.sh_flags & SHF_TLS);
+  };
+
   for (i64 i = 0; i < chunks.size(); i++) {
     if (!(chunks[i]->shdr.sh_flags & SHF_ALLOC))
       continue;
@@ -2679,6 +2603,24 @@ static i64 set_file_offsets(Context<E> &ctx) {
   return fileoff;
 }
 
+// Remove debug sections from ctx.chunks and save them to ctx.debug_chunks.
+// This is for --separate-debug-file.
+template <typename E>
+void separate_debug_sections(Context<E> &ctx) {
+  auto is_debug_section = [&](Chunk<E> *chunk) {
+    if (chunk->shdr.sh_flags & SHF_ALLOC)
+      return false;
+    return chunk == ctx.gdb_index || chunk == ctx.symtab || chunk == ctx.strtab ||
+           chunk->name.starts_with(".debug_");
+  };
+
+  auto mid = std::stable_partition(ctx.chunks.begin(), ctx.chunks.end(),
+                                   is_debug_section);
+
+  ctx.debug_chunks = {ctx.chunks.begin(), mid};
+  ctx.chunks.erase(ctx.chunks.begin(), mid);
+}
+
 template <typename E>
 void compute_section_headers(Context<E> &ctx) {
   // Update sh_size for each chunk.
@@ -2816,7 +2758,7 @@ void fix_synthetic_symbols(Context<E> &ctx) {
   // If we set values to these symbols in a static PIE, glibc attempts
   // to run ifunc initializers twice, with the second attempt with wrong
   // function addresses, causing a segmentation fault.
-  if (ctx.reldyn && ctx.arg.is_static && !ctx.arg.pie) {
+  if (ctx.reldyn && ctx.arg.static_ && !ctx.arg.pie) {
     stop(ctx.__rel_iplt_start, ctx.reldyn);
     stop(ctx.__rel_iplt_end, ctx.reldyn);
 
@@ -2973,7 +2915,7 @@ void fix_synthetic_symbols(Context<E> &ctx) {
 }
 
 template <typename E>
-i64 compress_debug_sections(Context<E> &ctx) {
+void compress_debug_sections(Context<E> &ctx) {
   Timer t(ctx, "compress_debug_sections");
 
   tbb::parallel_for((i64)0, (i64)ctx.chunks.size(), [&](i64 i) {
@@ -2995,8 +2937,179 @@ i64 compress_debug_sections(Context<E> &ctx) {
     ctx.ehdr->update_shdr(ctx);
   if (ctx.shdr)
     ctx.shdr->update_shdr(ctx);
+}
+
+// BLAKE3 is a cryptographic hash function just like SHA256.
+// We use it instead of SHA256 because it's faster.
+static void blake3_hash(u8 *buf, i64 size, u8 *out) {
+  blake3_hasher hasher;
+  blake3_hasher_init(&hasher);
+  blake3_hasher_update(&hasher, buf, size);
+  blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN);
+}
+
+template <typename E>
+std::vector<std::span<u8>> get_shards(Context<E> &ctx) {
+  constexpr i64 shard_size = 4 * 1024 * 1024; // 4 MiB
+  std::span<u8> buf = {ctx.buf, (size_t)ctx.output_file->filesize};
+  std::vector<std::span<u8>> vec;
+
+  while (!buf.empty()) {
+    i64 sz = std::min<i64>(shard_size, buf.size());
+    vec.push_back(buf.subspan(0, sz));
+    buf = buf.subspan(sz);
+  }
+  return vec;
+}
+
+template <typename E>
+void write_build_id(Context<E> &ctx) {
+  Timer t(ctx, "write_build_id");
+
+  switch (ctx.arg.build_id.kind) {
+  case BuildId::HEX:
+    ctx.buildid->contents = ctx.arg.build_id.value;
+    break;
+  case BuildId::HASH: {
+    std::vector<std::span<u8>> shards = get_shards(ctx);
+    std::vector<u8> hashes(shards.size() * BLAKE3_OUT_LEN);
+
+    tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
+      blake3_hash(shards[i].data(), shards[i].size(),
+                  hashes.data() + i * BLAKE3_OUT_LEN);
+
+#ifdef HAVE_MADVISE
+      // Ask the kernel to page out the shard we've just hashed so that
+      // the subsequent close(2) call becomes quicker.
+      if (i > 0 && ctx.output_file->is_mmapped)
+        madvise(shards[i].data(), shards[i].size(), MADV_DONTNEED);
+#endif
+    });
+
+    u8 buf[BLAKE3_OUT_LEN];
+    blake3_hash(hashes.data(), hashes.size(), buf);
+
+    assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN);
+    ctx.buildid->contents = {buf, buf + ctx.arg.build_id.size()};
+    break;
+  }
+  case BuildId::UUID: {
+    u8 buf[16];
+    get_random_bytes(buf, 16);
+
+    // Indicate that this is UUIDv4 as defined by RFC4122
+    buf[6] = (buf[6] & 0b0000'1111) | 0b0100'0000;
+    buf[8] = (buf[8] & 0b0011'1111) | 0b1000'0000;
+    ctx.buildid->contents = {buf, buf + 16};
+    break;
+  }
+  default:
+    unreachable();
+  }
+
+  ctx.buildid->copy_buf(ctx);
+}
+
+// A .gnu_debuglink section contains a filename and a CRC32 checksum of a
+// debug info file. When we are writing a .gnu_debuglink, we don't know
+// its CRC32 checksum because we haven't created a debug info file. So we
+// write a dummy value instead.
+//
+// We can't choose a random value as a dummy value for build
+// reproducibility. We also don't want to write a fixed value for all
+// files because the CRC checksum is in this section to prevent using a
+// wrong file when debugging. gdb rejects a debug info file if its CRC
+// doesn't match the one in .gnu_debuglink.
+//
+// Therefore, we'll try to make our CRC checksum as unique as possible.
+// We'll remember that checksum, and after creating a debug info file, add
+// a few bytes of garbage at the end of it so that the debug info file's
+// CRC checksum becomes the one that we have precomputed.
+template <typename E>
+void write_gnu_debuglink(Context<E> &ctx) {
+  Timer t(ctx, "write_gnu_debuglink");
+  u32 crc32;
+
+  if (ctx.buildid) {
+    crc32 = compute_crc32(0, ctx.buildid->contents.data(),
+                          ctx.buildid->contents.size());
+  } else {
+    std::vector<std::span<u8>> shards = get_shards(ctx);
+    std::vector<U64<E>> hashes(shards.size());
+
+    tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
+      hashes[i] = hash_string({(char *)shards[i].data(), shards[i].size()});
+    });
+    crc32 = compute_crc32(0, (u8 *)hashes.data(), hashes.size() * 8);
+  }
+
+  ctx.gnu_debuglink->crc32 = crc32;
+  ctx.gnu_debuglink->copy_buf(ctx);
+}
+
+// Write a separate debug file. This function is called after we finish
+// writing to the usual output file.
+template <typename E>
+void write_separate_debug_file(Context<E> &ctx) {
+  Timer t(ctx, "write_separate_debug_file");
+
+  // Open an output file early
+  LockingOutputFile<E> *file =
+    new LockingOutputFile<E>(ctx, ctx.arg.separate_debug_file, 0666);
+
+  // We want to write to the debug info file in background so that the
+  // user doesn't have to wait for it to complete.
+  if (ctx.arg.detach)
+    notify_parent();
+
+  // A debug info file contains the same sections as the original file,
+  // though most of them can be empty as if they were bss sections. We
+  // convert real sections into dummy sections here.
+  for (i64 i = 0; i < ctx.chunks.size(); i++) {
+    Chunk<E> *chunk = ctx.chunks[i];
+    if (chunk != ctx.ehdr && chunk != ctx.shdr && chunk != ctx.shstrtab &&
+        chunk->shdr.sh_type != SHT_NOTE) {
+      Chunk<E> *sec = new OutputSection<E>(chunk->name, SHT_NULL);
+      sec->shdr = chunk->shdr;
+      sec->shdr.sh_type = SHT_NOBITS;
+
+      ctx.chunks[i] = sec;
+      ctx.chunk_pool.emplace_back(sec);
+    }
+  }
+
+  // Restore debug info sections that had been set aside while we were
+  // creating the main file.
+  tbb::parallel_for_each(ctx.debug_chunks, [&](Chunk<E> *chunk) {
+    chunk->compute_section_size(ctx);
+  });
+
+  append(ctx.chunks, ctx.debug_chunks);
+
+  // Write to the debug info file as if it were a regular output file.
+  compute_section_headers(ctx);
+  file->resize(ctx, set_osec_offsets(ctx));
+
+  ctx.output_file.reset(file);
+  ctx.buf = ctx.output_file->buf;
+
+  copy_chunks(ctx);
+
+  if (ctx.gdb_index)
+    write_gdb_index(ctx);
+
+  // Reverse-compute a CRC32 value so that the CRC32 checksum embedded in
+  // the .gnu_debuglink section in the main executable matches the
+  // debug info file's CRC32 checksum.
+  u32 crc = compute_crc32(0, ctx.buf, ctx.output_file->filesize);
+
+  std::vector<u8> &buf2 = ctx.output_file->buf2;
+  if (!buf2.empty())
+    crc = compute_crc32(crc, buf2.data(), buf2.size());
 
-  return set_osec_offsets(ctx);
+  std::vector<u8> trailer = crc32_solve(crc, ctx.gnu_debuglink->crc32);
+  append(ctx.output_file->buf2, trailer);
+  ctx.output_file->close(ctx);
 }
 
 // Write Makefile-style dependency rules to a file specified by
@@ -3102,11 +3215,10 @@ template void create_internal_file(Context<E> &);
 template void apply_exclude_libs(Context<E> &);
 template void create_synthetic_sections(Context<E> &);
 template void resolve_symbols(Context<E> &);
-template void kill_eh_frame_sections(Context<E> &);
-template void split_section_pieces(Context<E> &);
-template void resolve_section_pieces(Context<E> &);
+template void do_lto(Context<E> &);
+template void parse_eh_frame_sections(Context<E> &);
+template void create_merged_sections(Context<E> &);
 template void convert_common_symbols(Context<E> &);
-template void compute_merged_section_sizes(Context<E> &);
 template void create_output_sections(Context<E> &);
 template void add_synthetic_symbols(Context<E> &);
 template void check_cet_errors(Context<E> &);
@@ -3114,6 +3226,7 @@ template void apply_section_align(Context<E> &);
 template void print_dependencies(Context<E> &);
 template void write_repro_file(Context<E> &);
 template void check_duplicate_symbols(Context<E> &);
+template void check_shlib_undefined(Context<E> &);
 template void check_symbol_types(Context<E> &);
 template void sort_init_fini(Context<E> &);
 template void sort_ctor_dtor(Context<E> &);
@@ -3127,19 +3240,22 @@ template void scan_relocations(Context<E> &);
 template void report_undef_errors(Context<E> &);
 template void create_reloc_sections(Context<E> &);
 template void copy_chunks(Context<E> &);
-template void rewrite_endbr(Context<E> &);
 template void construct_relr(Context<E> &);
+template void sort_dynsyms(Context<E> &);
 template void create_output_symtab(Context<E> &);
 template void apply_version_script(Context<E> &);
 template void parse_symbol_version(Context<E> &);
 template void compute_import_export(Context<E> &);
 template void compute_address_significance(Context<E> &);
-template void clear_padding(Context<E> &);
+template void separate_debug_sections(Context<E> &);
 template void compute_section_headers(Context<E> &);
 template i64 set_osec_offsets(Context<E> &);
 template void fix_synthetic_symbols(Context<E> &);
-template i64 compress_debug_sections(Context<E> &);
+template void compress_debug_sections(Context<E> &);
+template void write_build_id(Context<E> &);
+template void write_gnu_debuglink(Context<E> &);
+template void write_separate_debug_file(Context<E> &);
 template void write_dependency_file(Context<E> &);
 template void show_stats(Context<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/elf/relocatable.cc b/src/relocatable.cc
similarity index 96%
rename from elf/relocatable.cc
rename to src/relocatable.cc
index 01bf6d39..639dc6ae 100644
--- a/elf/relocatable.cc
+++ b/src/relocatable.cc
@@ -35,7 +35,7 @@
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_for_each.h>
 
-namespace mold::elf {
+namespace mold {
 
 // Create linker-synthesized sections
 template <typename E>
@@ -148,8 +148,6 @@ static u64 r_set_osec_offsets(Context<E> &ctx) {
 
 template <typename E>
 void combine_objects(Context<E> &ctx) {
-  compute_merged_section_sizes(ctx);
-
   create_output_sections(ctx);
 
   r_create_synthetic_sections(ctx);
@@ -171,12 +169,10 @@ void combine_objects(Context<E> &ctx) {
   compute_section_headers(ctx);
 
   i64 filesize = r_set_osec_offsets(ctx);
-  ctx.output_file =
-    OutputFile<Context<E>>::open(ctx, ctx.arg.output, filesize, 0666);
+  ctx.output_file = OutputFile<E>::open(ctx, ctx.arg.output, filesize, 0666);
   ctx.buf = ctx.output_file->buf;
 
   copy_chunks(ctx);
-  clear_padding(ctx);
   ctx.output_file->close(ctx);
   ctx.checkpoint();
 
@@ -197,4 +193,4 @@ using E = MOLD_TARGET;
 
 template void combine_objects(Context<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/src/shrink-sections.cc b/src/shrink-sections.cc
new file mode 100644
index 00000000..cfd3f4b3
--- /dev/null
+++ b/src/shrink-sections.cc
@@ -0,0 +1,151 @@
+// Since RISC instructions are generally up to 32 bits long, there's no
+// way to embed very large immediates into their branch instructions. For
+// example, RISC-V's JAL (jump and link) instruction can jump to only
+// within PC ± 1 MiB because its immediate is 21 bits long. If the
+// destination is further than that, we need to use two instructions
+// instead; the first instruction being AUIPC, which sets the upper 20
+// bits of a displacement to a register, and the second being JALR, which
+// specifies the lower 12 bits and the register. Combined, they specify a
+// 32-bit displacement, which is sufficient to support the medium code
+// model.
+//
+// However, always using two or more instructions for function calls is a
+// waste of time and space if the branch target is within a single
+// instruction's reach. There are two approaches to address this problem
+// as follows:
+//
+//  1. The compiler optimistically emits a single branch instruction for
+//     all function calls. The linker then checks if the branch target is
+//     reachable, and if not, redirects the branch to a linker-synthesized
+//     code sequence that uses two or more instructions to branch further.
+//     That linker-synthesized code is called a "thunk". All RISC psABIs
+//     except RISC-V and LoongArch take this approach.
+//
+//  2. The compiler pessimistically emits two instructions to branch
+//     anywhere in PC ± 2 GiB, and the linker rewrites them with a single
+//     instruction if the branch target is close enough. RISC-V and
+//     LoongArch take this approach.
+//
+// This file contains functions to support (2). For (1), see thunks.cc.
+//
+// With the presence of this code-shrinking relaxation, sections can no
+// longer be considered as atomic units. If we delete an instruction from
+// the middle of a section, the section contents after that point need to
+// be shifted by the size of the instruction. Symbol values and relocation
+// offsets have to be shifted too if they refer to bytes past the deleted
+// ones.
+//
+// In mold, we use `r_deltas` to memorize how many bytes have been shifted
+// for relocations. For symbols, we directly mutate their `value` member.
+//
+// RISC-V and LoongArch object files tend to have way more relocations
+// than those for other targets. This is because all branches, including
+// those that jump within the same section, are explicitly expressed with
+// relocations. Here is why we need them: all control-flow statements,
+// such as `if` or `for`, are implemented using branch instructions. For
+// other targets, the compiler doesn't emit relocations for such branches
+// because it knows at compile-time exactly how many bytes have to be
+// skipped. That's not true in RISC-V and LoongArch because the linker may
+// delete bytes between a branch and its target. Therefore, all branches,
+// including in-section ones, have to be explicitly expressed with
+// relocations.
+//
+// Note that this mechanism only shrinks sections and never enlarges them,
+// as the compiler always emits the longest instruction sequence. This
+// makes the linker implementation a bit simpler because we don't need to
+// worry about oscillation.
+
+#if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE || \
+    MOLD_LOONGARCH64 || MOLD_LOONGARCH32
+
+#include "mold.h"
+
+#include <tbb/parallel_for_each.h>
+
+namespace mold {
+
+using E = MOLD_TARGET;
+
+static bool is_resizable(InputSection<E> *isec) {
+  return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) &&
+         (isec->shdr().sh_flags & SHF_EXECINSTR);
+}
+
+template <>
+void shrink_sections<E>(Context<E> &ctx) {
+  Timer t(ctx, "shrink_sections");
+
+  // True if we can use the 2-byte instructions. This is usually true on
+  // Unix because RV64GC is generally considered the baseline hardware.
+  bool use_rvc = false;
+  if constexpr (is_riscv<E>)
+    use_rvc = get_eflags(ctx) & EF_RISCV_RVC;
+
+  // Find all relaxable relocations and record how many bytes we can save
+  // into r_deltas.
+  //
+  // Technically speaking, relaxing relocations may allow more relocations
+  // to be relaxed because the distance between a branch instruction and
+  // its target may decrease as a result of relaxation. That said, the
+  // number of such relocations is negligible (I tried to self-host mold
+  // on RISC-V as an experiment and found that the mold-built .text is
+  // only ~0.04% larger than that of GNU ld), so we don't bother to handle
+  // them. We scan relocations only once here.
+  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
+    for (std::unique_ptr<InputSection<E>> &isec : file->sections)
+      if (is_resizable(isec.get()))
+        shrink_section(ctx, *isec, use_rvc);
+  });
+
+  // Fix symbol values.
+  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
+    for (Symbol<E> *sym : file->symbols) {
+      if (sym->file != file)
+        continue;
+
+      InputSection<E> *isec = sym->get_input_section();
+      if (!isec || isec->extra.r_deltas.empty())
+        continue;
+
+      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
+      auto it = std::lower_bound(rels.begin(), rels.end(), sym->value,
+                                 [&](const ElfRel<E> &r, u64 val) {
+        return r.r_offset < val;
+      });
+
+      sym->value -= isec->extra.r_deltas[it - rels.begin()];
+    }
+  });
+
+  // Recompute sizes of executable sections
+  tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+    if (chunk->to_osec() && (chunk->shdr.sh_flags & SHF_EXECINSTR))
+      chunk->compute_section_size(ctx);
+  });
+}
+
+// Returns the distance between a relocated place and a symbol.
+template <>
+i64 compute_distance<E>(Context<E> &ctx, Symbol<E> &sym,
+                        InputSection<E> &isec, const ElfRel<E> &rel) {
+  // We handle absolute symbols as if they were infinitely far away
+  // because `shrink_section` may increase a distance between a branch
+  // instruction and an absolute symbol. Branching to an absolute
+  // location is extremely rare in real code, though.
+  if (sym.is_absolute())
+    return INT64_MAX;
+
+  // Likewise, relocations against weak undefined symbols won't be relaxed.
+  if (sym.esym().is_undef_weak())
+    return INT64_MAX;
+
+  // Compute a distance between the relocated place and the symbol.
+  i64 S = sym.get_addr(ctx);
+  i64 A = rel.r_addend;
+  i64 P = isec.get_addr() + rel.r_offset;
+  return S + A - P;
+}
+
+} // namespace mold
+
+#endif
diff --git a/elf/subprocess.cc b/src/subprocess-unix.cc
similarity index 88%
rename from elf/subprocess.cc
rename to src/subprocess-unix.cc
index 51be8972..44e5e65a 100644
--- a/elf/subprocess.cc
+++ b/src/subprocess-unix.cc
@@ -1,5 +1,3 @@
-#if !defined(_WIN32) && !defined(__APPLE__)
-
 #include "mold.h"
 #include "config.h"
 
@@ -11,13 +9,15 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
-namespace mold::elf {
+namespace mold {
 
 #ifdef MOLD_X86_64
+static int pipe_write_fd = -1;
+
 // Exiting from a program with large memory usage is slow --
 // it may take a few hundred milliseconds. To hide the latency,
 // we fork a child and let it do the actual linking work.
-std::function<void()> fork_child() {
+void fork_child() {
   int pipefd[2];
   if (pipe(pipefd) == -1) {
     perror("pipe");
@@ -50,12 +50,17 @@ std::function<void()> fork_child() {
 
   // Child
   close(pipefd[0]);
+  pipe_write_fd = pipefd[1];
+}
+
+void notify_parent() {
+  if (pipe_write_fd == -1)
+    return;
 
-  return [=] {
-    char buf[] = {1};
-    [[maybe_unused]] int n = write(pipefd[1], buf, 1);
-    assert(n == 1);
-  };
+  char buf[] = {1};
+  [[maybe_unused]] int n = write(pipe_write_fd, buf, 1);
+  assert(n == 1);
+  pipe_write_fd = -1;
 }
 #endif
 
@@ -84,6 +89,9 @@ static std::string find_dso(Context<E> &ctx, std::filesystem::path self) {
 template <typename E>
 [[noreturn]]
 void process_run_subcommand(Context<E> &ctx, int argc, char **argv) {
+#ifdef __APPLE__
+  Fatal(ctx) << "-run is not supported on macOS";
+#else
   assert(argv[1] == "-run"s || argv[1] == "--run"s);
 
   if (!argv[2])
@@ -111,12 +119,11 @@ void process_run_subcommand(Context<E> &ctx, int argc, char **argv) {
   // Execute a given command
   execvp(argv[2], argv + 2);
   Fatal(ctx) << "mold -run failed: " << argv[2] << ": " << errno_string();
+#endif
 }
 
 using E = MOLD_TARGET;
 
 template void process_run_subcommand(Context<E> &, int, char **);
 
-} // namespace mold::elf
-
-#endif
+} // namespace mold
diff --git a/src/subprocess-win32.cc b/src/subprocess-win32.cc
new file mode 100644
index 00000000..fb336827
--- /dev/null
+++ b/src/subprocess-win32.cc
@@ -0,0 +1,20 @@
+#include "mold.h"
+
+namespace mold {
+
+#ifdef MOLD_X86_64
+void fork_child() {}
+void notify_parent() {}
+#endif
+
+template <typename E>
+[[noreturn]]
+void process_run_subcommand(Context<E> &ctx, int argc, char **argv) {
+  Fatal(ctx) << "-run is supported only on Unix";
+}
+
+using E = MOLD_TARGET;
+
+template void process_run_subcommand(Context<E> &, int, char **);
+
+} // namespace mold
diff --git a/elf/thunks.cc b/src/thunks.cc
similarity index 92%
rename from elf/thunks.cc
rename to src/thunks.cc
index 26b0d15c..c5a99fbc 100644
--- a/elf/thunks.cc
+++ b/src/thunks.cc
@@ -20,15 +20,14 @@
 // we don't need to try too hard to reduce thunk size to the absolute
 // minimum.
 
-#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 || \
-    MOLD_LOONGARCH64 || MOLD_LOONGARCH32
+#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2
 
 #include "mold.h"
 
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_for_each.h>
 
-namespace mold::elf {
+namespace mold {
 
 using E = MOLD_TARGET;
 
@@ -39,9 +38,7 @@ static consteval i64 max_distance() {
   // and therefore the least two bits are always zero. So the branch
   // operand is effectively 28 bits long. That means the branch range is
   // [-2^27, 2^27) or PC ± 128 MiB.
-  //
-  // LoongArch's BR instruction also takes a 26 bit immediate.
-  if (is_arm64<E> || is_loongarch<E>)
+  if (is_arm64<E>)
     return 1 << 27;
 
   // ARM32's Thumb branch has 24 bits immediate, and the instructions are
@@ -179,6 +176,7 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
   // haven't.
   for (InputSection<E> *isec : m)
     isec->offset = -1;
+  thunks.clear();
 
   // We create thunks from the beginning of the section to the end.
   // We manage progress using four offsets which increase monotonically.
@@ -247,10 +245,8 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
 
     // Scan relocations between B and C to collect symbols that need
     // entries in the new thunk.
-    tbb::parallel_for_each(m.begin() + b, m.begin() + c,
-                           [&](InputSection<E> *isec) {
-      scan_rels(ctx, *isec, *thunk, thunk_idx);
-    });
+    for (i64 i = b; i < c; i++)
+      scan_rels(ctx, *m[i], *thunk, thunk_idx);
 
     // Now that we know the number of symbols in the thunk, we can compute
     // the thunk's size.
@@ -270,16 +266,15 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
     }
 
     // Scan relocations again to fix symbol offsets in the last thunk.
-    tbb::parallel_for_each(m.begin() + b, m.begin() + c,
-                           [&](InputSection<E> *isec) {
-      std::span<Symbol<E> *> syms = isec->file.symbols;
-      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
-      std::span<ThunkRef> thunk_refs = isec->extra.thunk_refs;
-
-      for (i64 i = 0; i < rels.size(); i++)
-        if (thunk_refs[i].thunk_idx == thunk_idx)
-          thunk_refs[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx;
-    });
+    for (i64 i = b; i < c; i++) {
+      std::span<Symbol<E> *> syms = m[i]->file.symbols;
+      std::span<const ElfRel<E>> rels = m[i]->get_rels(ctx);
+      std::span<ThunkRef> thunk_refs = m[i]->extra.thunk_refs;
+
+      for (i64 j = 0; j < rels.size(); j++)
+        if (thunk_refs[j].thunk_idx == thunk_idx)
+          thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx;
+    }
 
     // Move B forward to point to the begining of the next batch.
     b = c;
@@ -295,6 +290,6 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
       std::max<u32>(this->shdr.sh_addralign, 1 << isec->p2align);
 }
 
-} // namespace mold::elf
+} // namespace mold
 
 #endif
diff --git a/elf/tls.cc b/src/tls.cc
similarity index 88%
rename from elf/tls.cc
rename to src/tls.cc
index 8d391ace..8d8476d2 100644
--- a/elf/tls.cc
+++ b/src/tls.cc
@@ -122,44 +122,26 @@
 
 #include "mold.h"
 
-namespace mold::elf {
-
-template <typename E>
-static ElfPhdr<E> *get_tls_segment(Context<E> &ctx) {
-  if (ctx.phdr)
-    for (ElfPhdr<E> &phdr : ctx.phdr->phdrs)
-      if (phdr.p_type == PT_TLS)
-        return &phdr;
-  return nullptr;
-}
-
-template <typename E>
-u64 get_tls_begin(Context<E> &ctx) {
-  if (ElfPhdr<E> *phdr = get_tls_segment(ctx))
-    return phdr->p_vaddr;
-  return 0;
-}
+namespace mold {
 
 // Returns the TP address which can be used for efficient TLV accesses in
 // the main executable. TP at runtime refers to a per-process TLS block
 // whose address is not known at link-time. So the address returned from
 // this function is the TP if the TLS template image were a TLS block.
 template <typename E>
-u64 get_tp_addr(Context<E> &ctx) {
-  ElfPhdr<E> *phdr = get_tls_segment(ctx);
-  if (!phdr)
-    return 0;
+u64 get_tp_addr(const ElfPhdr<E> &phdr) {
+  assert(phdr.p_type == PT_TLS);
 
   if constexpr (is_x86<E> || is_sparc<E> || is_s390x<E>) {
     // On x86, SPARC and s390x, TP (%gs on i386, %fs on x86-64, %g7 on SPARC
     // and %a0/%a1 on s390x) refers to past the end of the TLS block for
     // historical reasons. TLVs are accessed with negative offsets from TP.
-    return align_to(phdr->p_vaddr + phdr->p_memsz, phdr->p_align);
-  } else if constexpr (is_arm<E> || is_sh4<E> || is_alpha<E>) {
-    // On ARM, SH4 and Alpha, the runtime appends two words at the beginning
+    return align_to(phdr.p_vaddr + phdr.p_memsz, phdr.p_align);
+  } else if constexpr (is_arm<E> || is_sh4<E>) {
+    // On ARM and SH4, the runtime inserts two words at the beginning
     // of TLV template image when copying TLVs to the TLS block, so we need
     // to offset it.
-    return align_down(phdr->p_vaddr - sizeof(Word<E>) * 2, phdr->p_align);
+    return align_down(phdr.p_vaddr - sizeof(Word<E>) * 2, phdr.p_align);
   } else if constexpr (is_ppc<E> || is_m68k<E>) {
     // On PowerPC and m68k, TP is 0x7000 (28 KiB) past the beginning
     // of the TLV block to maximize the addressable range of load/store
@@ -167,24 +149,22 @@ u64 get_tp_addr(Context<E> &ctx) {
     // (32 KiB) off because there's a small implementation-defined piece of
     // data before the initial TLV block, and the runtime wants to access
     // them efficiently too.
-    return phdr->p_vaddr + 0x7000;
+    return phdr.p_vaddr + 0x7000;
   } else {
     // RISC-V and LoongArch just uses the beginning of the main executable's
     // TLV block as TP. Their load/store instructions usually take 12-bits
     // signed immediates, so the beginning of the TLS block ± 2 KiB is
     // accessible with a single load/store instruction.
     static_assert(is_riscv<E> || is_loongarch<E>);
-    return phdr->p_vaddr;
+    return phdr.p_vaddr;
   }
 }
 
 // Returns the address __tls_get_addr() would return if it's called
 // with offset 0.
 template <typename E>
-u64 get_dtp_addr(Context<E> &ctx) {
-  ElfPhdr<E> *phdr = get_tls_segment(ctx);
-  if (!phdr)
-    return 0;
+u64 get_dtp_addr(const ElfPhdr<E> &phdr) {
+  assert(phdr.p_type == PT_TLS);
 
   if constexpr (is_ppc<E> || is_m68k<E>) {
     // On PowerPC and m68k, R_DTPOFF is resolved to the address 0x8000
@@ -193,21 +173,20 @@ u64 get_dtp_addr(Context<E> &ctx) {
     // immediates. That is, if the offset were right at the beginning of the
     // start of the TLS block, the half of addressible space (negative
     // immediates) would have been wasted.
-    return phdr->p_vaddr + 0x8000;
+    return phdr.p_vaddr + 0x8000;
   } else if constexpr (is_riscv<E>) {
     // On RISC-V, the bias is 0x800 as the load/store instructions in the
     // ISA usually have a 12-bit immediate.
-    return phdr->p_vaddr + 0x800;
+    return phdr.p_vaddr + 0x800;
   } else {
     // On other targets, DTP simply refers to the beginning of the TLS block.
-    return phdr->p_vaddr;
+    return phdr.p_vaddr;
   }
 }
 
 using E = MOLD_TARGET;
 
-template u64 get_tls_begin<E>(Context<E> &);
-template u64 get_tp_addr<E>(Context<E> &);
-template u64 get_dtp_addr<E>(Context<E> &);
+template u64 get_tp_addr(const ElfPhdr<E> &);
+template u64 get_dtp_addr(const ElfPhdr<E> &);
 
-} // namespace mold::elf
+} // namespace mold
diff --git a/test/elf/CMakeLists.txt b/test/CMakeLists.txt
similarity index 95%
rename from test/elf/CMakeLists.txt
rename to test/CMakeLists.txt
index 69a0cdae..e64a1f0e 100644
--- a/test/elf/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -31,7 +31,7 @@ endif()
 if(MOLD_ENABLE_QEMU_TESTS)
   list(APPEND QEMU_ARCHS
     x86_64 i386 arm aarch64 ppc ppc64 ppc64le sparc64 sh4 s390x
-    alpha riscv64 riscv32 m68k loongarch64)
+    riscv64 riscv32 m68k loongarch64)
 
   LIST(APPEND TRIPLES
     x86_64-linux-gnu
@@ -45,7 +45,6 @@ if(MOLD_ENABLE_QEMU_TESTS)
     sparc64-linux-gnu
     s390x-linux-gnu
     sh4-linux-gnu
-    alpha-linux-gnu
     riscv32-linux-gnu
     m68k-linux-gnu
     loongarch64-linux-gnu)
@@ -69,10 +68,10 @@ function(add_target ARCH TRIPLE)
   file(GLOB ALL_TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS
     "*.sh")
 
-  list(FILTER ALL_TESTS EXCLUDE REGEX "_")
+  list(FILTER ALL_TESTS EXCLUDE REGEX "^arch-")
 
   file(GLOB TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS
-    "${ARCH}_*.sh")
+    "arch-${ARCH}-*.sh")
 
   list(APPEND TESTS ${ALL_TESTS})
 
@@ -168,10 +167,6 @@ if(${MACHINE} STREQUAL "sh4" OR (HAS_qemu-sh4 AND HAS_sh4-linux-gnu-gcc))
   add_target(sh4 sh4-linux-gnu)
 endif()
 
-if(${MACHINE} STREQUAL "alpha" OR (HAS_qemu-alpha AND HAS_alpha-linux-gnu-gcc))
-  add_target(alpha alpha-linux-gnu)
-endif()
-
 if(${MACHINE} STREQUAL "m68k" OR (HAS_qemu-m68k AND HAS_m68k-linux-gnu-gcc))
   add_target(m68k m68k-linux-gnu)
 endif()
diff --git a/test/elf/abs-error.sh b/test/abs-error.sh
similarity index 94%
rename from test/elf/abs-error.sh
rename to test/abs-error.sh
index ca1cc1d7..65499c31 100755
--- a/test/elf/abs-error.sh
+++ b/test/abs-error.sh
@@ -5,7 +5,6 @@
 [ $MACHINE = ppc64 ] && skip
 [ $MACHINE = ppc64le ] && skip
 [ $MACHINE = s390x ] && skip
-[ $MACHINE = alpha ] && skip
 [[ $MACHINE = loongarch* ]] && skip
 
 cat <<EOF | $CC -fPIC -c -o $t/a.o -xassembler -
diff --git a/test/elf/absolute-symbols.sh b/test/absolute-symbols.sh
similarity index 93%
rename from test/elf/absolute-symbols.sh
rename to test/absolute-symbols.sh
index 75b6fbac..febd3e36 100755
--- a/test/elf/absolute-symbols.sh
+++ b/test/absolute-symbols.sh
@@ -8,7 +8,7 @@
 
 cat <<EOF | $CC -o $t/a.o -c -x assembler -
 .globl foo
-foo = 0x800008
+foo = 0xa00008
 EOF
 
 cat <<EOF | $CC -o $t/b.o -c -fno-PIC -xc -
@@ -36,4 +36,4 @@ int main() {
 EOF
 
 $CC -B. -o $t/exe -no-pie $t/a.o $t/b.o
-$QEMU $t/exe | grep -q '^ip=0x80000.$'
+$QEMU $t/exe | grep -q '^ip=0xa0000.$'
diff --git a/test/elf/allow-multiple-definition.sh b/test/allow-multiple-definition.sh
similarity index 100%
rename from test/elf/allow-multiple-definition.sh
rename to test/allow-multiple-definition.sh
diff --git a/test/elf/ar-alignment.sh b/test/ar-alignment.sh
similarity index 100%
rename from test/elf/ar-alignment.sh
rename to test/ar-alignment.sh
diff --git a/test/elf/aarch64_range-extension-thunk-disassembly.sh b/test/arch-aarch64-range-extension-thunk-disassembly.sh
similarity index 92%
rename from test/elf/aarch64_range-extension-thunk-disassembly.sh
rename to test/arch-aarch64-range-extension-thunk-disassembly.sh
index c6bb4648..4c7c5fce 100755
--- a/test/elf/aarch64_range-extension-thunk-disassembly.sh
+++ b/test/arch-aarch64-range-extension-thunk-disassembly.sh
@@ -1,8 +1,6 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = aarch64 ] || skip
-
 cat <<EOF | $CC -c -o $t/a.o -fPIC -xc -
 #include <stdio.h>
 
diff --git a/test/elf/aarch64_variant-pcs.sh b/test/arch-aarch64-variant-pcs.sh
similarity index 100%
rename from test/elf/aarch64_variant-pcs.sh
rename to test/arch-aarch64-variant-pcs.sh
diff --git a/test/elf/arm_abs-error.sh b/test/arch-arm-abs-error.sh
similarity index 86%
rename from test/elf/arm_abs-error.sh
rename to test/arch-arm-abs-error.sh
index fbc57d55..3a79c43c 100755
--- a/test/elf/arm_abs-error.sh
+++ b/test/arch-arm-abs-error.sh
@@ -12,5 +12,7 @@ extern char foo;
 int main() { printf("foo=%p\n", &foo); }
 EOF
 
+$CC -o $t/exe -pie $t/a.o $t/b.o >& /dev/null && skip
+
 ! $CC -B. -o $t/exe -pie $t/a.o $t/b.o >& $t/log
 grep -q 'recompile with -fPIC' $t/log
diff --git a/test/elf/arm_range-extension-thunk-disassembly.sh b/test/arch-arm-range-extension-thunk-disassembly.sh
similarity index 100%
rename from test/elf/arm_range-extension-thunk-disassembly.sh
rename to test/arch-arm-range-extension-thunk-disassembly.sh
diff --git a/test/elf/arm_range-extension-thunk.sh b/test/arch-arm-range-extension-thunk.sh
similarity index 100%
rename from test/elf/arm_range-extension-thunk.sh
rename to test/arch-arm-range-extension-thunk.sh
diff --git a/test/elf/arm_thumb-interwork.sh b/test/arch-arm-thumb-interwork.sh
similarity index 100%
rename from test/elf/arm_thumb-interwork.sh
rename to test/arch-arm-thumb-interwork.sh
diff --git a/test/elf/arm_tlsdesc.sh b/test/arch-arm-tlsdesc.sh
similarity index 100%
rename from test/elf/arm_tlsdesc.sh
rename to test/arch-arm-tlsdesc.sh
diff --git a/test/elf/i686_tls-module-base.sh b/test/arch-i686-tls-module-base.sh
similarity index 100%
rename from test/elf/i686_tls-module-base.sh
rename to test/arch-i686-tls-module-base.sh
diff --git a/test/elf/i686_tlsdesc.sh b/test/arch-i686-tlsdesc.sh
similarity index 100%
rename from test/elf/i686_tlsdesc.sh
rename to test/arch-i686-tlsdesc.sh
diff --git a/test/elf/loongarch64_mcmodel-extreme.sh b/test/arch-loongarch64-mcmodel-extreme.sh
similarity index 100%
rename from test/elf/loongarch64_mcmodel-extreme.sh
rename to test/arch-loongarch64-mcmodel-extreme.sh
diff --git a/test/arch-loongarch64-relax-call36.sh b/test/arch-loongarch64-relax-call36.sh
new file mode 100755
index 00000000..34e40982
--- /dev/null
+++ b/test/arch-loongarch64-relax-call36.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<'EOF' | $CC -o $t/a.o -c -xassembler -
+.globl foo, bar
+.space 0x100000
+foo:
+  move      $s0,   $ra
+  .reloc ., R_LARCH_CALL36, print
+  .reloc ., R_LARCH_RELAX
+  pcaddu18i $t0,   0
+  jirl      $ra,   $t0, 0
+  move      $ra,   $s0
+  ret
+bar:
+  .reloc ., R_LARCH_CALL36, print
+  .reloc ., R_LARCH_RELAX
+  pcaddu18i $t0,   0
+  jirl      $zero, $t0, 0
+.space 0x100000
+EOF
+
+cat <<EOF | $CC -o $t/b.o -c -xc -
+#include <stdio.h>
+
+void foo();
+void bar();
+
+void print() {
+  printf("foo");
+}
+
+int main() {
+  foo();
+  bar();
+  printf("\n");
+}
+EOF
+
+$CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,--no-relax
+$QEMU $t/exe1 | grep -q foofoo
+
+$OBJDUMP -d $t/exe1 > $t/exe1.objdump
+grep -A2 '<foo>:' $t/exe1.objdump | grep -wq pcaddu18i
+grep -A2 '<bar>:' $t/exe1.objdump | grep -wq pcaddu18i
+
+$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--relax
+$QEMU $t/exe2 | grep -q foofoo
+
+$OBJDUMP -d $t/exe2 > $t/exe2.objdump
+grep -A2 '<foo>:' $t/exe2.objdump | grep -wq bl
+grep -A2 '<bar>:' $t/exe2.objdump | grep -wq b
diff --git a/test/arch-loongarch64-relax-got-load.sh b/test/arch-loongarch64-relax-got-load.sh
new file mode 100755
index 00000000..279fa8b5
--- /dev/null
+++ b/test/arch-loongarch64-relax-got-load.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $CC -o $t/a.o -c -xc - -fPIC
+int foo = 3;
+EOF
+
+cat <<EOF | $CC -o $t/b.o -c -xc - -fPIC -O
+extern int foo;
+int get_foo() { return foo; }
+EOF
+
+cat <<EOF | $CC -o $t/c.o -c -xc - -fPIC
+#include <stdio.h>
+int get_foo();
+int main() { printf("%d\n", get_foo()); }
+EOF
+
+$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie -Wl,--no-relax
+$QEMU $t/exe1 | grep -q '^3$'
+$OBJDUMP -d $t/exe1 | grep -A2 '<get_foo>:' | grep -Fqw pcalau12i
+$OBJDUMP -d $t/exe1 | grep -A2 '<get_foo>:' | grep -Fqw ld.d
+
+$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax
+$QEMU $t/exe2 | grep -q '^3$'
+$OBJDUMP -d $t/exe2 | grep -A1 '<get_foo>:' | grep -Fqw pcaddi
+
+$CC -B. -o $t/exe3 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax \
+  -Wl,-Ttext=0x1000000,-Tdata=0x2000000
+
+$QEMU $t/exe3 | grep -q '^3$'
+$OBJDUMP -d $t/exe3 | grep -A2 '<get_foo>:' | grep -Fqw pcalau12i
+$OBJDUMP -d $t/exe3 | grep -A2 '<get_foo>:' | grep -Fqw addi.d
diff --git a/test/arch-loongarch64-relax-pcala-addi.sh b/test/arch-loongarch64-relax-pcala-addi.sh
new file mode 100755
index 00000000..fe26c73c
--- /dev/null
+++ b/test/arch-loongarch64-relax-pcala-addi.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<'EOF' | $CC -o $t/a.o -c -xassembler -
+.globl get_sym1, get_sym2, get_sym3
+get_sym1:
+  la.pcrel $a0, sym1
+  ret
+get_sym2:
+  la.pcrel $a0, sym2
+  ret
+get_sym3:
+  la.pcrel $a0, sym3
+  ret
+EOF
+
+cat <<'EOF' | $CC -o $t/b.o -c -xassembler -
+.globl sym1, sym2, sym3
+sym1:
+  li.d $a0, 1
+  ret
+.space 1024 * 1024
+sym2:
+  li.d $a0, 2
+  ret
+.space 1024 * 1024
+sym3:
+  li.d $a0, 3
+  ret
+EOF
+
+cat <<EOF | $CC -o $t/c.o -c -xc -
+#include <stdio.h>
+
+int (*get_sym1())();
+int (*get_sym2())();
+int (*get_sym3())();
+
+int main() {
+  printf("%d %d %d\n", get_sym1()(), get_sym2()(), get_sym3()());
+}
+EOF
+
+$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax
+$QEMU $t/exe1 | grep -q '^1 2 3$'
+
+$OBJDUMP -d $t/exe1 > $t/exe1.objdump
+grep -A1 '<get_sym1>:' $t/exe1.objdump | grep -q pcalau12i
+grep -A1 '<get_sym2>:' $t/exe1.objdump | grep -q pcalau12i
+grep -A1 '<get_sym3>:' $t/exe1.objdump | grep -q pcalau12i
+
+$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,--relax
+$QEMU $t/exe2 | grep -q '^1 2 3$'
+
+$OBJDUMP -d $t/exe2 > $t/exe2.objdump
+grep -A1 '<get_sym1>:' $t/exe2.objdump | grep -q pcaddi
+grep -A1 '<get_sym2>:' $t/exe2.objdump | grep -q pcaddi
+grep -A1 '<get_sym3>:' $t/exe2.objdump | grep -q pcalau12i
diff --git a/test/arch-loongarch64-relax-tlsdesc.sh b/test/arch-loongarch64-relax-tlsdesc.sh
new file mode 100755
index 00000000..37b44715
--- /dev/null
+++ b/test/arch-loongarch64-relax-tlsdesc.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<'EOF' | $CC -o $t/a.o -c -xc - -fPIC
+_Thread_local char foo[4] = "foo";
+_Thread_local char padding[100000] = "padding";
+EOF
+
+cat <<'EOF' | $CC -o $t/b.o -c -xc - -fPIC
+_Thread_local char bar[4] = "bar";
+EOF
+
+cat <<'EOF' | $CC -o $t/c.o -c -xc - -fPIC -mtls-dialect=desc -O2
+extern _Thread_local char foo[4];
+extern _Thread_local char bar[4];
+
+char *get_foo() { return foo; }
+char *get_bar() { return bar; }
+EOF
+
+cat <<EOF | $CC -o $t/d.o -c -xc - -mtls-dialect=desc
+#include <stdio.h>
+char *get_foo();
+char *get_bar();
+
+int main() {
+  printf("%s %s\n", get_foo(), get_bar());
+}
+EOF
+
+$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--no-relax
+$QEMU $t/exe1 | grep -q 'foo bar'
+
+$OBJDUMP -d $t/exe1 > $t/exe1.objdump
+grep -A6 '<get_foo>:' $t/exe1.objdump | grep -Fq pcalau12i
+grep -A6 '<get_bar>:' $t/exe1.objdump | grep -Fq pcalau12i
+
+$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--relax
+$QEMU $t/exe2 | grep -q 'foo bar'
+
+$OBJDUMP -d $t/exe2 > $t/exe2.objdump
+grep -A6 '<get_foo>:' $t/exe2.objdump | grep -Fq li.w
+grep -A6 '<get_bar>:' $t/exe2.objdump | grep -Fq lu12i.w
diff --git a/test/elf/ppc64le_save_restore_gprs.sh b/test/arch-ppc64le-save-restore-gprs.sh
similarity index 100%
rename from test/elf/ppc64le_save_restore_gprs.sh
rename to test/arch-ppc64le-save-restore-gprs.sh
diff --git a/test/elf/riscv64_attributes.sh b/test/arch-riscv64-attributes.sh
similarity index 100%
rename from test/elf/riscv64_attributes.sh
rename to test/arch-riscv64-attributes.sh
diff --git a/test/elf/riscv64_attributes2.sh b/test/arch-riscv64-attributes2.sh
similarity index 100%
rename from test/elf/riscv64_attributes2.sh
rename to test/arch-riscv64-attributes2.sh
diff --git a/test/elf/riscv64_global-pointer-dso.sh b/test/arch-riscv64-global-pointer-dso.sh
similarity index 100%
rename from test/elf/riscv64_global-pointer-dso.sh
rename to test/arch-riscv64-global-pointer-dso.sh
diff --git a/test/elf/riscv64_global-pointer.sh b/test/arch-riscv64-global-pointer.sh
similarity index 100%
rename from test/elf/riscv64_global-pointer.sh
rename to test/arch-riscv64-global-pointer.sh
diff --git a/test/elf/riscv64_norvc.sh b/test/arch-riscv64-norvc.sh
similarity index 100%
rename from test/elf/riscv64_norvc.sh
rename to test/arch-riscv64-norvc.sh
diff --git a/test/elf/riscv64_obj-compatible.sh b/test/arch-riscv64-obj-compatible.sh
similarity index 100%
rename from test/elf/riscv64_obj-compatible.sh
rename to test/arch-riscv64-obj-compatible.sh
diff --git a/test/elf/riscv64_relax-got.sh b/test/arch-riscv64-relax-got.sh
similarity index 100%
rename from test/elf/riscv64_relax-got.sh
rename to test/arch-riscv64-relax-got.sh
diff --git a/test/elf/riscv64_relax-hi20.sh b/test/arch-riscv64-relax-hi20.sh
similarity index 92%
rename from test/elf/riscv64_relax-hi20.sh
rename to test/arch-riscv64-relax-hi20.sh
index a0befcda..fb4774eb 100755
--- a/test/elf/riscv64_relax-hi20.sh
+++ b/test/arch-riscv64-relax-hi20.sh
@@ -46,3 +46,5 @@ $QEMU $t/exe1 | grep -q 'f00 10000f00 ba 11beef'
 
 $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o
 $QEMU $t/exe2 | grep -q 'f00 10000f00 ba 11beef'
+
+[ $(stat --format='%s' $t/exe1) -gt $(stat --format='%s' $t/exe2) ]
diff --git a/test/elf/riscv64_weak-undef.sh b/test/arch-riscv64-weak-undef.sh
similarity index 100%
rename from test/elf/riscv64_weak-undef.sh
rename to test/arch-riscv64-weak-undef.sh
diff --git a/test/elf/s390x_got.sh b/test/arch-s390x-got.sh
similarity index 76%
rename from test/elf/s390x_got.sh
rename to test/arch-s390x-got.sh
index 60234e42..ac061d4f 100755
--- a/test/elf/s390x_got.sh
+++ b/test/arch-s390x-got.sh
@@ -10,9 +10,9 @@ extern char _DYNAMIC;
 extern void *got[];
 
 int main() {
-  printf("%p %p\n", &_DYNAMIC, got[0]);
+  printf("%d %p %p\n", &_DYNAMIC == got[0], &_DYNAMIC, got[0]);
 }
 EOF
 
 $CC -B. -o $t/exe $t/a.o -Wl,-defsym=got=_GLOBAL_OFFSET_TABLE_ -no-pie
-$QEMU $t/exe | grep -Eq '^(\S+) \1$'
+$QEMU $t/exe | grep -Eq '^1'
diff --git a/test/arch-x86_64-address-equality.sh b/test/arch-x86_64-address-equality.sh
new file mode 100755
index 00000000..ccdf7528
--- /dev/null
+++ b/test/arch-x86_64-address-equality.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $CC -o $t/a.o -c -xassembler -
+.globl foo
+foo:
+  lea bar(%rip), %rax
+  ret
+EOF
+
+cat <<EOF | $CC -o $t/b.o -c -xc - -fPIC
+#include <stdio.h>
+void *foo();
+void bar();
+int main() { printf("%d %p %p\n", foo() == bar, foo(), bar); }
+EOF
+
+cat <<EOF | $CC -o $t/c.o -c -xc - -fPIC
+void bar() {}
+EOF
+
+$CC -B. -shared -o $t/d.so $t/c.o
+
+$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/d.so -pie
+$QEMU $t/exe1 | grep -q '^1 '
+
+$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/d.so -pie -Wl,-no-relax
+$QEMU $t/exe2 | grep -q '^1 '
diff --git a/test/elf/x86_64_empty-mergeable-section.sh b/test/arch-x86_64-empty-mergeable-section.sh
similarity index 100%
rename from test/elf/x86_64_empty-mergeable-section.sh
rename to test/arch-x86_64-empty-mergeable-section.sh
diff --git a/test/elf/x86_64_emulation-deduction.sh b/test/arch-x86_64-emulation-deduction.sh
similarity index 100%
rename from test/elf/x86_64_emulation-deduction.sh
rename to test/arch-x86_64-emulation-deduction.sh
diff --git a/test/elf/x86_64_exception-mcmodel-large.sh b/test/arch-x86_64-exception-mcmodel-large.sh
similarity index 100%
rename from test/elf/x86_64_exception-mcmodel-large.sh
rename to test/arch-x86_64-exception-mcmodel-large.sh
diff --git a/test/elf/x86_64_execstack-if-needed.sh b/test/arch-x86_64-execstack-if-needed.sh
similarity index 100%
rename from test/elf/x86_64_execstack-if-needed.sh
rename to test/arch-x86_64-execstack-if-needed.sh
diff --git a/test/elf/x86_64_gnu-linkonce.sh b/test/arch-x86_64-gnu-linkonce.sh
similarity index 100%
rename from test/elf/x86_64_gnu-linkonce.sh
rename to test/arch-x86_64-gnu-linkonce.sh
diff --git a/test/elf/x86_64_gnu-retain.sh b/test/arch-x86_64-gnu-retain.sh
similarity index 100%
rename from test/elf/x86_64_gnu-retain.sh
rename to test/arch-x86_64-gnu-retain.sh
diff --git a/test/elf/x86_64_gotpcrelx.sh b/test/arch-x86_64-gotpcrelx.sh
similarity index 100%
rename from test/elf/x86_64_gotpcrelx.sh
rename to test/arch-x86_64-gotpcrelx.sh
diff --git a/test/elf/x86_64_ifunc-alias.sh b/test/arch-x86_64-ifunc-alias.sh
similarity index 100%
rename from test/elf/x86_64_ifunc-alias.sh
rename to test/arch-x86_64-ifunc-alias.sh
diff --git a/test/elf/x86_64_incompatible-libs-linker-script.sh b/test/arch-x86_64-incompatible-libs-linker-script.sh
similarity index 100%
rename from test/elf/x86_64_incompatible-libs-linker-script.sh
rename to test/arch-x86_64-incompatible-libs-linker-script.sh
diff --git a/test/elf/x86_64_incompatible-libs-linker-script2.sh b/test/arch-x86_64-incompatible-libs-linker-script2.sh
similarity index 92%
rename from test/elf/x86_64_incompatible-libs-linker-script2.sh
rename to test/arch-x86_64-incompatible-libs-linker-script2.sh
index 46be3af2..3630692c 100755
--- a/test/elf/x86_64_incompatible-libs-linker-script2.sh
+++ b/test/arch-x86_64-incompatible-libs-linker-script2.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
+nm mold | grep -q '__tsan_init' && skip
 echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null || skip
 
 mkdir -p $t/foo
diff --git a/test/elf/x86_64_incompatible-libs.sh b/test/arch-x86_64-incompatible-libs.sh
similarity index 100%
rename from test/elf/x86_64_incompatible-libs.sh
rename to test/arch-x86_64-incompatible-libs.sh
diff --git a/test/elf/x86_64_incompatible-libs2.sh b/test/arch-x86_64-incompatible-libs2.sh
similarity index 100%
rename from test/elf/x86_64_incompatible-libs2.sh
rename to test/arch-x86_64-incompatible-libs2.sh
diff --git a/test/elf/x86_64_incompatible-obj.sh b/test/arch-x86_64-incompatible-obj.sh
similarity index 100%
rename from test/elf/x86_64_incompatible-obj.sh
rename to test/arch-x86_64-incompatible-obj.sh
diff --git a/test/elf/x86_64_init-array-readonly.sh b/test/arch-x86_64-init-array-readonly.sh
similarity index 100%
rename from test/elf/x86_64_init-array-readonly.sh
rename to test/arch-x86_64-init-array-readonly.sh
diff --git a/test/elf/x86_64_init-array.sh b/test/arch-x86_64-init-array.sh
similarity index 100%
rename from test/elf/x86_64_init-array.sh
rename to test/arch-x86_64-init-array.sh
diff --git a/test/arch-x86_64-isa-level.sh b/test/arch-x86_64-isa-level.sh
new file mode 100755
index 00000000..d51afd8d
--- /dev/null
+++ b/test/arch-x86_64-isa-level.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $CC -o $t/a.o -c -xc -
+int main() {}
+EOF
+
+$CC -B. -o $t/exe2 $t/a.o -Wl,-z,x86-64-v2
+readelf -n $t/exe2 | grep -Fq 'Unknown note type: (0x00000005)' && skip
+readelf -n $t/exe2 | grep -Fq 'procesor-specific type 0xc0008002' && skip
+readelf -n $t/exe2 | grep -q 'x86 ISA needed: .*x86-64-v2'
+
+$CC -B. -o $t/exe3 $t/a.o -Wl,-z,x86-64-v3
+readelf -n $t/exe3 | grep -q 'x86 ISA needed: .*x86-64-v3'
+
+$CC -B. -o $t/exe4 $t/a.o -Wl,-z,x86-64-v4
+readelf -n $t/exe4 | grep -q 'x86 ISA needed: .*x86-64-v4'
diff --git a/test/elf/x86_64_large-bss.sh b/test/arch-x86_64-large-bss.sh
similarity index 100%
rename from test/elf/x86_64_large-bss.sh
rename to test/arch-x86_64-large-bss.sh
diff --git a/test/elf/x86_64_mergeable-records.sh b/test/arch-x86_64-mergeable-records.sh
similarity index 100%
rename from test/elf/x86_64_mergeable-records.sh
rename to test/arch-x86_64-mergeable-records.sh
diff --git a/test/arch-x86_64-mergeable-strings-nonalloc.sh b/test/arch-x86_64-mergeable-strings-nonalloc.sh
new file mode 100755
index 00000000..3d817d36
--- /dev/null
+++ b/test/arch-x86_64-mergeable-strings-nonalloc.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<'EOF' | $CC -o $t/a.o -c -xc -
+int main() {}
+EOF
+
+cat <<'EOF' | $CC -o $t/b.o -c -x assembler -
+.section .foo, "", @progbits
+.quad .L1 - 1
+.quad .L2 - 1
+
+.section .bar, "MS", @progbits, 1
+.L1:
+  .string "abc"
+.L2:
+  .string "xyz"
+EOF
+
+$CC -B. -o $t/exe $t/a.o $t/b.o
+
+readelf -x .foo $t/exe | grep -Fq '03000000 00000000 ffffffff ffffffff'
+readelf -x .bar $t/exe | grep -Fq 'xyz.abc.'
diff --git a/test/elf/x86_64_mergeable-strings.sh b/test/arch-x86_64-mergeable-strings.sh
similarity index 100%
rename from test/elf/x86_64_mergeable-strings.sh
rename to test/arch-x86_64-mergeable-strings.sh
diff --git a/test/elf/x86_64_note-property.sh b/test/arch-x86_64-note-property.sh
similarity index 100%
rename from test/elf/x86_64_note-property.sh
rename to test/arch-x86_64-note-property.sh
diff --git a/test/elf/x86_64_note-property2.sh b/test/arch-x86_64-note-property2.sh
similarity index 100%
rename from test/elf/x86_64_note-property2.sh
rename to test/arch-x86_64-note-property2.sh
diff --git a/test/elf/x86_64_note.sh b/test/arch-x86_64-note.sh
similarity index 94%
rename from test/elf/x86_64_note.sh
rename to test/arch-x86_64-note.sh
index cff814b4..51aa68d4 100755
--- a/test/elf/x86_64_note.sh
+++ b/test/arch-x86_64-note.sh
@@ -37,5 +37,5 @@ grep -Eq '.note.baz\s+NOTE.+000008 00   A  0   0  8' $t/log
 grep -Eq '.note.nonalloc\s+NOTE.+000008 00      0   0  1' $t/log
 
 readelf --segments $t/exe > $t/log
-grep -Fq '01     .note.baz .note.foo .note.bar' $t/log
+grep -Fq '01     .note.bar .note.baz .note.foo' $t/log
 ! grep -q 'NOTE.*0x0000000000000000 0x0000000000000000' $t/log || false
diff --git a/test/elf/x86_64_note2.sh b/test/arch-x86_64-note2.sh
similarity index 93%
rename from test/elf/x86_64_note2.sh
rename to test/arch-x86_64-note2.sh
index 24ebef58..e2bb3036 100755
--- a/test/elf/x86_64_note2.sh
+++ b/test/arch-x86_64-note2.sh
@@ -29,4 +29,4 @@ EOF
 ./mold -o $t/exe $t/a.o $t/b.o $t/c.o $t/d.o
 
 readelf --segments $t/exe > $t/log
-grep -Fq '01     .note.a .note.c .note.b' $t/log
+grep -Fq '01     .note.a .note.b .note.c' $t/log
diff --git a/test/elf/x86_64_plt.sh b/test/arch-x86_64-plt.sh
similarity index 100%
rename from test/elf/x86_64_plt.sh
rename to test/arch-x86_64-plt.sh
diff --git a/test/elf/x86_64_preinit-array.sh b/test/arch-x86_64-preinit-array.sh
similarity index 100%
rename from test/elf/x86_64_preinit-array.sh
rename to test/arch-x86_64-preinit-array.sh
diff --git a/test/elf/x86_64_relax.sh b/test/arch-x86_64-relax.sh
similarity index 100%
rename from test/elf/x86_64_relax.sh
rename to test/arch-x86_64-relax.sh
diff --git a/test/elf/x86_64_reloc-overflow.sh b/test/arch-x86_64-reloc-overflow.sh
similarity index 100%
rename from test/elf/x86_64_reloc-overflow.sh
rename to test/arch-x86_64-reloc-overflow.sh
diff --git a/test/elf/x86_64_reloc-zero.sh b/test/arch-x86_64-reloc-zero.sh
similarity index 100%
rename from test/elf/x86_64_reloc-zero.sh
rename to test/arch-x86_64-reloc-zero.sh
diff --git a/test/elf/x86_64_reloc.sh b/test/arch-x86_64-reloc.sh
similarity index 100%
rename from test/elf/x86_64_reloc.sh
rename to test/arch-x86_64-reloc.sh
diff --git a/test/elf/x86_64_section-alignment.sh b/test/arch-x86_64-section-alignment.sh
similarity index 100%
rename from test/elf/x86_64_section-alignment.sh
rename to test/arch-x86_64-section-alignment.sh
diff --git a/test/elf/x86_64_section-name.sh b/test/arch-x86_64-section-name.sh
similarity index 100%
rename from test/elf/x86_64_section-name.sh
rename to test/arch-x86_64-section-name.sh
diff --git a/test/arch-x86_64-tbss-only.sh b/test/arch-x86_64-tbss-only.sh
new file mode 100755
index 00000000..6ebdb453
--- /dev/null
+++ b/test/arch-x86_64-tbss-only.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+# Test if grep supports backreferences
+echo abab | grep -Eq '(ab)\1' || skip
+
+cat <<EOF | $CC -o $t/a.o -c -xc -
+__thread char foo;
+
+__attribute__((section(".data.rel.ro.bar"), aligned(16*1024)))
+char bar;
+
+int main() {}
+EOF
+
+$CC -B. -o $t/exe $t/a.o
+$QEMU $t/exe
+
+readelf -W --segments $t/exe | grep -Eq 'TLS +0x000([^ ][^ ][^ ]) 0x[^ ]+\1 '
diff --git a/test/elf/x86_64_tls-gd-mcmodel-large.sh b/test/arch-x86_64-tls-gd-mcmodel-large.sh
similarity index 100%
rename from test/elf/x86_64_tls-gd-mcmodel-large.sh
rename to test/arch-x86_64-tls-gd-mcmodel-large.sh
diff --git a/test/elf/x86_64_tls-gd-to-ie.sh b/test/arch-x86_64-tls-gd-to-ie.sh
similarity index 100%
rename from test/elf/x86_64_tls-gd-to-ie.sh
rename to test/arch-x86_64-tls-gd-to-ie.sh
diff --git a/test/elf/x86_64_tls-large-tbss.sh b/test/arch-x86_64-tls-large-tbss.sh
similarity index 100%
rename from test/elf/x86_64_tls-large-tbss.sh
rename to test/arch-x86_64-tls-large-tbss.sh
diff --git a/test/elf/x86_64_tls-ld-mcmodel-large.sh b/test/arch-x86_64-tls-ld-mcmodel-large.sh
similarity index 100%
rename from test/elf/x86_64_tls-ld-mcmodel-large.sh
rename to test/arch-x86_64-tls-ld-mcmodel-large.sh
diff --git a/test/elf/x86_64_tls-module-base.sh b/test/arch-x86_64-tls-module-base.sh
similarity index 96%
rename from test/elf/x86_64_tls-module-base.sh
rename to test/arch-x86_64-tls-module-base.sh
index 211d2a5a..830f0297 100755
--- a/test/elf/x86_64_tls-module-base.sh
+++ b/test/arch-x86_64-tls-module-base.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
+supports_tlsdesc || skip
+
 cat <<EOF | $CC -fPIC -o $t/a.o -c -xassembler -
 .globl get_foo
 .type get_foo, @function
diff --git a/test/elf/x86_64_tlsdesc.sh b/test/arch-x86_64-tlsdesc.sh
similarity index 100%
rename from test/elf/x86_64_tlsdesc.sh
rename to test/arch-x86_64-tlsdesc.sh
diff --git a/test/elf/x86_64_unique.sh b/test/arch-x86_64-unique.sh
similarity index 100%
rename from test/elf/x86_64_unique.sh
rename to test/arch-x86_64-unique.sh
diff --git a/test/elf/x86_64_warn-execstack.sh b/test/arch-x86_64-warn-execstack.sh
similarity index 61%
rename from test/elf/x86_64_warn-execstack.sh
rename to test/arch-x86_64-warn-execstack.sh
index f4cbfda1..aaf6c244 100755
--- a/test/elf/x86_64_warn-execstack.sh
+++ b/test/arch-x86_64-warn-execstack.sh
@@ -9,4 +9,4 @@ cat <<EOF | $CC -o $t/b.o -c -xc -
 int main() {}
 EOF
 
-$GCC -B. -o $t/exe $t/a.o $t/b.o 2>&1 | grep -q 'may cause a segmentation fault'
+$GCC -B. -o $t/exe $t/a.o $t/b.o 2>&1 | grep -Eq 'may cause a segmentation fault|requires executable stack'
diff --git a/test/elf/x86_64_warn-shared-textrel.sh b/test/arch-x86_64-warn-shared-textrel.sh
similarity index 100%
rename from test/elf/x86_64_warn-shared-textrel.sh
rename to test/arch-x86_64-warn-shared-textrel.sh
diff --git a/test/elf/x86_64_warn-textrel.sh b/test/arch-x86_64-warn-textrel.sh
similarity index 100%
rename from test/elf/x86_64_warn-textrel.sh
rename to test/arch-x86_64-warn-textrel.sh
diff --git a/test/elf/x86_64_z-ibt.sh b/test/arch-x86_64-z-ibt.sh
similarity index 100%
rename from test/elf/x86_64_z-ibt.sh
rename to test/arch-x86_64-z-ibt.sh
diff --git a/test/elf/x86_64_z-ibtplt.sh b/test/arch-x86_64-z-ibtplt.sh
similarity index 100%
rename from test/elf/x86_64_z-ibtplt.sh
rename to test/arch-x86_64-z-ibtplt.sh
diff --git a/test/elf/x86_64_endbr.sh b/test/arch-x86_64-z-rewrite-endbr.sh
similarity index 100%
rename from test/elf/x86_64_endbr.sh
rename to test/arch-x86_64-z-rewrite-endbr.sh
diff --git a/test/elf/x86_64_endbr2.sh b/test/arch-x86_64-z-rewrite-endbr2.sh
similarity index 100%
rename from test/elf/x86_64_endbr2.sh
rename to test/arch-x86_64-z-rewrite-endbr2.sh
diff --git a/test/arch-x86_64-z-rewrite-endbr3.sh b/test/arch-x86_64-z-rewrite-endbr3.sh
new file mode 100755
index 00000000..f8358542
--- /dev/null
+++ b/test/arch-x86_64-z-rewrite-endbr3.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+# The compiler must accept -fcf-protection, and the binary has to run
+# directly rather than under QEMU.
+test_cflags -fcf-protection || skip
+[ -z "$QEMU" ] || skip
+
+# Skip unless `sde` exists and identifies itself as Intel SDE
+command -v sde >& /dev/null || skip
+sde --help | grep -q 'Software Development Emulator' || skip
+
+$CC -o $t/a.o -c -xc - -O -fcf-protection <<EOF
+#include <stdio.h>
+int main() {
+  printf("Hello world\n");
+}
+EOF
+
+# Link with -z rewrite-endbr and run the result with `sde -cet 1`
+$CC -B. -o $t/exe $t/a.o -Wl,-z,rewrite-endbr
+sde -cet 1 -- $t/exe | grep -q 'Hello world'
diff --git a/test/elf/x86_64_z-shstk.sh b/test/arch-x86_64-z-shstk.sh
similarity index 100%
rename from test/elf/x86_64_z-shstk.sh
rename to test/arch-x86_64-z-shstk.sh
diff --git a/test/elf/x86_64_z-text.sh b/test/arch-x86_64-z-text.sh
similarity index 100%
rename from test/elf/x86_64_z-text.sh
rename to test/arch-x86_64-z-text.sh
diff --git a/test/elf/as-needed-dso.sh b/test/as-needed-dso.sh
similarity index 93%
rename from test/elf/as-needed-dso.sh
rename to test/as-needed-dso.sh
index 40f0a46b..60fd6bd6 100755
--- a/test/elf/as-needed-dso.sh
+++ b/test/as-needed-dso.sh
@@ -18,4 +18,4 @@ EOF
 $CC -B. -o $t/exe $t/a.o -L$t -Wl,--as-needed -lbar -lfoo
 readelf -W --dynamic $t/exe > $t/log2
 grep -q libbar $t/log2
-! grep -q libfoo $t/log2 || false
+grep -q libfoo $t/log2
diff --git a/test/elf/as-needed-dso2.sh b/test/as-needed-dso2.sh
similarity index 100%
rename from test/elf/as-needed-dso2.sh
rename to test/as-needed-dso2.sh
diff --git a/test/elf/as-needed-weak.sh b/test/as-needed-weak.sh
similarity index 77%
rename from test/elf/as-needed-weak.sh
rename to test/as-needed-weak.sh
index 112561fc..fc432300 100755
--- a/test/elf/as-needed-weak.sh
+++ b/test/as-needed-weak.sh
@@ -18,14 +18,14 @@ cat <<EOF | $CC -o $t/libbar.so -shared -fPIC -Wl,-soname,libbar.so -xc -
 int fn2() { return 42; }
 EOF
 
-$CC -o $t/exe1 $t/a.o -Wl,-no-as-needed -L$t -lbar -lfoo
+$CC -B. -o $t/exe1 $t/a.o -Wl,-no-as-needed -L$t -lbar -lfoo
 
 readelf --dynamic $t/exe1 > $t/log1
 grep -Fq 'Shared library: [libfoo.so]' $t/log1
 grep -Fq 'Shared library: [libbar.so]' $t/log1
 
-$CC -o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo
+$CC -B. -o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo
 
 readelf --dynamic $t/exe2 > $t/log2
-! grep -Fq 'Shared library: [libfoo.so]' $t/log2 || false
+grep -Fq 'Shared library: [libfoo.so]' $t/log2
 ! grep -Fq 'Shared library: [libbar.so]' $t/log2 || false
diff --git a/test/elf/as-needed.sh b/test/as-needed.sh
similarity index 60%
rename from test/elf/as-needed.sh
rename to test/as-needed.sh
index b0389c27..6d5448c8 100755
--- a/test/elf/as-needed.sh
+++ b/test/as-needed.sh
@@ -18,12 +18,12 @@ EOF
 
 $CC -B. -o $t/exe $t/a.o -Wl,--no-as-needed $t/b.so $t/c.so
 
-readelf --dynamic $t/exe > $t/readelf
-grep -Fq 'Shared library: [libfoo.so]' $t/readelf
-grep -Fq 'Shared library: [libbar.so]' $t/readelf
+readelf --dynamic $t/exe > $t/log
+grep -Fq 'Shared library: [libfoo.so]' $t/log
+grep -Fq 'Shared library: [libbar.so]' $t/log
 
 $CC -B. -o $t/exe $t/a.o -Wl,--as-needed $t/b.so $t/c.so
 
-readelf --dynamic $t/exe > $t/readelf
-grep -Fq 'Shared library: [libfoo.so]' $t/readelf
-! grep -Fq 'Shared library: [libbar.so]' $t/readelf || false
+readelf --dynamic $t/exe > $t/log
+grep -Fq 'Shared library: [libfoo.so]' $t/log
+! grep -Fq 'Shared library: [libbar.so]' $t/log || false
diff --git a/test/elf/auxiliary.sh b/test/auxiliary.sh
similarity index 100%
rename from test/elf/auxiliary.sh
rename to test/auxiliary.sh
diff --git a/test/elf/bno-symbolic.sh b/test/bno-symbolic.sh
similarity index 100%
rename from test/elf/bno-symbolic.sh
rename to test/bno-symbolic.sh
diff --git a/test/elf/bsymbolic-functions.sh b/test/bsymbolic-functions.sh
similarity index 100%
rename from test/elf/bsymbolic-functions.sh
rename to test/bsymbolic-functions.sh
diff --git a/test/elf/bsymbolic-non-weak-functions.sh b/test/bsymbolic-non-weak-functions.sh
similarity index 100%
rename from test/elf/bsymbolic-non-weak-functions.sh
rename to test/bsymbolic-non-weak-functions.sh
diff --git a/test/elf/bsymbolic-non-weak.sh b/test/bsymbolic-non-weak.sh
similarity index 100%
rename from test/elf/bsymbolic-non-weak.sh
rename to test/bsymbolic-non-weak.sh
diff --git a/test/elf/bsymbolic.sh b/test/bsymbolic.sh
similarity index 100%
rename from test/elf/bsymbolic.sh
rename to test/bsymbolic.sh
diff --git a/test/elf/build-id.sh b/test/build-id.sh
similarity index 87%
rename from test/elf/build-id.sh
rename to test/build-id.sh
index acff861e..d2310925 100755
--- a/test/elf/build-id.sh
+++ b/test/build-id.sh
@@ -18,5 +18,8 @@ readelf -n $t/exe | grep -q 'GNU.*0x00000014.*NT_GNU_BUILD_ID'
 $CC -B. -o $t/exe $t/a.c -Wl,-build-id=sha256
 readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID'
 
+$CC -B. -o $t/exe $t/a.c -Wl,-build-id=fast
+readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID'
+
 $CC -B. -o $t/exe $t/a.c -Wl,-build-id=0xdeadbeefdeadbeef
 readelf -n $t/exe | grep -q 'Build ID: deadbeefdeadbeef'
diff --git a/test/elf/canonical-plt.sh b/test/canonical-plt.sh
similarity index 100%
rename from test/elf/canonical-plt.sh
rename to test/canonical-plt.sh
diff --git a/test/elf/cmdline.sh b/test/cmdline.sh
similarity index 100%
rename from test/elf/cmdline.sh
rename to test/cmdline.sh
diff --git a/test/elf/color-diagnostics.sh b/test/color-diagnostics.sh
similarity index 100%
rename from test/elf/color-diagnostics.sh
rename to test/color-diagnostics.sh
diff --git a/test/elf/comment.sh b/test/comment.sh
similarity index 100%
rename from test/elf/comment.sh
rename to test/comment.sh
diff --git a/test/elf/common-archive.sh b/test/common-archive.sh
similarity index 100%
rename from test/elf/common-archive.sh
rename to test/common-archive.sh
diff --git a/test/elf/common-ref.sh b/test/common-ref.sh
similarity index 100%
rename from test/elf/common-ref.sh
rename to test/common-ref.sh
diff --git a/test/elf/common.sh b/test/common-symbols.sh
similarity index 100%
rename from test/elf/common.sh
rename to test/common-symbols.sh
diff --git a/test/elf/common.inc b/test/common.inc
similarity index 74%
rename from test/elf/common.inc
rename to test/common.inc
index 5200ff31..fdad9f27 100644
--- a/test/elf/common.inc
+++ b/test/common.inc
@@ -6,6 +6,7 @@ export LC_ALL=C
 canonical_name() {
   case $1 in
   i?86) echo i686 ;;
+  amd64) echo x86_64 ;;
   arm*) echo arm ;;
   powerpc) echo ppc ;;
   powerpc64) echo ppc64 ;;
@@ -20,7 +21,7 @@ fi
 
 # Set tool names
 if [ -z "$TRIPLE" ]; then
-  TESTDIR=out/test/elf/$MACHINE
+  TESTDIR=out/test/$MACHINE
   CC="${TEST_CC:-cc}"
   CXX="${TEST_CXX:-c++}"
   GCC="${TEST_GCC:-gcc}"
@@ -31,7 +32,7 @@ if [ -z "$TRIPLE" ]; then
   QEMU=
 elif [ "$TRIPLE" = powerpc64le-linux-gnu -a "$CPU" = power10 ]; then
   MACHINE=ppc64le
-  TESTDIR=out/test/elf/ppc64le-power10
+  TESTDIR=out/test/ppc64le-power10
   CC="${TEST_CC:-$TRIPLE-gcc} -mcpu=power10"
   CXX="${TEST_CXX:-$TRIPLE-g++} -mcpu=power10"
   GCC="${TEST_GCC:-$TRIPLE-gcc} -mcpu=power10"
@@ -42,7 +43,7 @@ elif [ "$TRIPLE" = powerpc64le-linux-gnu -a "$CPU" = power10 ]; then
   QEMU="qemu-ppc64le -L /usr/$TRIPLE -cpu power10"
 else
   MACHINE=$(canonical_name $(echo $TRIPLE | sed 's/-.*//'))
-  TESTDIR=out/test/elf/$MACHINE
+  TESTDIR=out/test/$MACHINE
   CC="${TEST_CC:-$TRIPLE-gcc}"
   CXX="${TEST_CXX:-$TRIPLE-g++}"
   GCC="${TEST_GCC:-$TRIPLE-gcc}"
@@ -58,19 +59,26 @@ else
   fi
 fi
 
-if [ $MACHINE = x86_64 -o $MACHINE = i686 -o $MACHINE = arm ]; then
-  tlsdesc_opt=-mtls-dialect=gnu2
-elif [ $MACHINE = aarch64 ]; then
-  tlsdesc_opt=-mtls-dialect=desc
+case $MACHINE in
+x86_64 | i686 | arm)
+  tlsdesc_opt=-mtls-dialect=gnu2 ;;
+aarch64 | loongarch*)
+  tlsdesc_opt=-mtls-dialect=desc ;;
+esac
+
+# We want to use GNU's binutils even on BSDs. `pkg install binutils`
+# installs GNU binutils under /usr/local/bin.
+if [ "$(uname)" = FreeBSD ]; then
+  export PATH="/usr/local/bin:$PATH"
 fi
 
 # Common functions
 test_cflags() {
-  echo 'int main() {}' | $CC "$@" -o /dev/null -xc - >& /dev/null
+  echo 'int main() {}' | $CC -B. "$@" -o /dev/null -xc - >& /dev/null
 }
 
 test_cxxflags() {
-  echo 'int main() {}' | $CXX "$@" -o /dev/null -xc++ - >& /dev/null
+  echo 'int main() {}' | $CXX -B. "$@" -o /dev/null -xc++ - >& /dev/null
 }
 
 is_musl() {
@@ -87,7 +95,10 @@ supports_tlsdesc() {
   # musl's tlsdesc on arm32 seems to be broken
   [ $MACHINE = arm ] && is_musl && return 1
 
-  [ -n "$tlsdesc_opt" ]
+  # FreeBSD's loader doesn't seem to support TLSDESC relocs in an executable
+  [ "$(uname)" = FreeBSD ] && return 1
+
+  [ "$tlsdesc_opt" != '' ]
 }
 
 on_qemu() {
@@ -120,3 +131,4 @@ testname=$(basename "$0" .sh)
 echo -n "Testing $testname ... "
 t=$TESTDIR/$testname
 mkdir -p $t
+set -x
diff --git a/test/elf/compress-debug-sections-zstd.sh b/test/compress-debug-sections-zstd.sh
similarity index 100%
rename from test/elf/compress-debug-sections-zstd.sh
rename to test/compress-debug-sections-zstd.sh
diff --git a/test/elf/compress-debug-sections.sh b/test/compress-debug-sections.sh
similarity index 100%
rename from test/elf/compress-debug-sections.sh
rename to test/compress-debug-sections.sh
diff --git a/test/elf/compressed-debug-info.sh b/test/compressed-debug-info.sh
similarity index 100%
rename from test/elf/compressed-debug-info.sh
rename to test/compressed-debug-info.sh
diff --git a/test/elf/copyrel-alignment.sh b/test/copyrel-alignment.sh
similarity index 96%
rename from test/elf/copyrel-alignment.sh
rename to test/copyrel-alignment.sh
index 4b265ac7..432179bd 100755
--- a/test/elf/copyrel-alignment.sh
+++ b/test/copyrel-alignment.sh
@@ -3,7 +3,6 @@
 
 [ $MACHINE = ppc64 ] && skip
 [ $MACHINE = ppc64le ] && skip
-[ $MACHINE = alpha ] && skip
 [[ $MACHINE = loongarch* ]] && skip
 
 cat <<EOF | $CC -fPIC -shared -o $t/a.so -xc -
diff --git a/test/elf/copyrel-norelro.sh b/test/copyrel-norelro.sh
similarity index 95%
rename from test/elf/copyrel-norelro.sh
rename to test/copyrel-norelro.sh
index 6892edff..36b712e0 100755
--- a/test/elf/copyrel-norelro.sh
+++ b/test/copyrel-norelro.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = alpha ] && skip
 [[ $MACHINE = ppc64* ]] && skip
 [[ $MACHINE = loongarch* ]] && skip
 
diff --git a/test/elf/copyrel-protected.sh b/test/copyrel-protected.sh
similarity index 80%
rename from test/elf/copyrel-protected.sh
rename to test/copyrel-protected.sh
index 8c4c0a09..0cd196c4 100755
--- a/test/elf/copyrel-protected.sh
+++ b/test/copyrel-protected.sh
@@ -3,7 +3,6 @@
 
 [ $MACHINE = ppc64 ] && skip
 [ $MACHINE = ppc64le ] && skip
-[ $MACHINE = alpha ] && skip
 [[ $MACHINE = loongarch* ]] && skip
 
 cat <<EOF | $CC -o $t/a.o -c -xc -fno-PIE -
@@ -19,4 +18,4 @@ __attribute__((visibility("protected"))) int foo;
 EOF
 
 ! $CC -B. $t/a.o $t/b.so -o $t/exe >& $t/log -no-pie || false
-grep -Fq 'cannot make copy relocation for protected symbol' $t/log
+grep -Fq 'cannot create a copy relocation for protected symbol' $t/log
diff --git a/test/elf/copyrel-relro.sh b/test/copyrel-relro.sh
similarity index 100%
rename from test/elf/copyrel-relro.sh
rename to test/copyrel-relro.sh
diff --git a/test/elf/copyrel-relro2.sh b/test/copyrel-relro2.sh
similarity index 100%
rename from test/elf/copyrel-relro2.sh
rename to test/copyrel-relro2.sh
diff --git a/test/elf/copyrel.sh b/test/copyrel.sh
similarity index 100%
rename from test/elf/copyrel.sh
rename to test/copyrel.sh
diff --git a/test/elf/ctors-in-init-array.sh b/test/ctors-in-init-array.sh
similarity index 100%
rename from test/elf/ctors-in-init-array.sh
rename to test/ctors-in-init-array.sh
diff --git a/test/elf/dead-debug-sections.sh b/test/dead-debug-sections.sh
similarity index 100%
rename from test/elf/dead-debug-sections.sh
rename to test/dead-debug-sections.sh
diff --git a/test/elf/debug-macro-section.sh b/test/debug-macro-section.sh
similarity index 100%
rename from test/elf/debug-macro-section.sh
rename to test/debug-macro-section.sh
diff --git a/test/elf/default-symver.sh b/test/default-symver.sh
similarity index 100%
rename from test/elf/default-symver.sh
rename to test/default-symver.sh
diff --git a/test/elf/defsym-lto.sh b/test/defsym-lto.sh
similarity index 79%
rename from test/elf/defsym-lto.sh
rename to test/defsym-lto.sh
index 3848384b..d60b83df 100755
--- a/test/elf/defsym-lto.sh
+++ b/test/defsym-lto.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-echo 'int main() {}' | $CC -flto -o /dev/null -xc - >& /dev/null \
-  || skip
+test_cflags -flto || skip
 
 cat <<EOF | $CC -flto -fPIC -o $t/a.o -c -xc -
 #include <stdio.h>
diff --git a/test/elf/defsym-missing-symbol.sh b/test/defsym-missing-symbol.sh
similarity index 100%
rename from test/elf/defsym-missing-symbol.sh
rename to test/defsym-missing-symbol.sh
diff --git a/test/elf/defsym.sh b/test/defsym.sh
similarity index 100%
rename from test/elf/defsym.sh
rename to test/defsym.sh
diff --git a/test/elf/defsym2.sh b/test/defsym2.sh
similarity index 100%
rename from test/elf/defsym2.sh
rename to test/defsym2.sh
diff --git a/test/elf/demangle-cpp.sh b/test/demangle-cpp.sh
similarity index 100%
rename from test/elf/demangle-cpp.sh
rename to test/demangle-cpp.sh
diff --git a/test/elf/demangle-rust.sh b/test/demangle-rust.sh
similarity index 100%
rename from test/elf/demangle-rust.sh
rename to test/demangle-rust.sh
diff --git a/test/elf/demangle.sh b/test/demangle.sh
similarity index 100%
rename from test/elf/demangle.sh
rename to test/demangle.sh
diff --git a/test/elf/dependency-file-response-file.sh b/test/dependency-file-response-file.sh
similarity index 100%
rename from test/elf/dependency-file-response-file.sh
rename to test/dependency-file-response-file.sh
diff --git a/test/elf/dependency-file.sh b/test/dependency-file.sh
similarity index 100%
rename from test/elf/dependency-file.sh
rename to test/dependency-file.sh
diff --git a/test/elf/disable-new-dtags.sh b/test/disable-new-dtags.sh
similarity index 100%
rename from test/elf/disable-new-dtags.sh
rename to test/disable-new-dtags.sh
diff --git a/test/elf/discard.sh b/test/discard.sh
similarity index 91%
rename from test/elf/discard.sh
rename to test/discard.sh
index e419838b..b7628c7e 100755
--- a/test/elf/discard.sh
+++ b/test/discard.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = riscv64 -o $MACHINE = riscv32 ] && skip
+[[ $MACHINE = riscv* ]] && skip
+[[ $MACHINE = loongarch* ]] && skip
 
 cat <<EOF | $CC -o $t/a.o -c -x assembler -Wa,-L -
   .text
diff --git a/test/elf/dso-undef.sh b/test/dso-undef.sh
similarity index 100%
rename from test/elf/dso-undef.sh
rename to test/dso-undef.sh
diff --git a/test/elf/dt-init.sh b/test/dt-init.sh
similarity index 100%
rename from test/elf/dt-init.sh
rename to test/dt-init.sh
diff --git a/test/elf/dt-needed.sh b/test/dt-needed.sh
similarity index 100%
rename from test/elf/dt-needed.sh
rename to test/dt-needed.sh
diff --git a/test/elf/duplicate-error-archive.sh b/test/duplicate-error-archive.sh
similarity index 100%
rename from test/elf/duplicate-error-archive.sh
rename to test/duplicate-error-archive.sh
diff --git a/test/elf/duplicate-error.sh b/test/duplicate-error.sh
similarity index 100%
rename from test/elf/duplicate-error.sh
rename to test/duplicate-error.sh
diff --git a/test/elf/dynamic-dt-debug.sh b/test/dynamic-dt-debug.sh
similarity index 100%
rename from test/elf/dynamic-dt-debug.sh
rename to test/dynamic-dt-debug.sh
diff --git a/test/elf/dynamic-linker.sh b/test/dynamic-linker.sh
similarity index 100%
rename from test/elf/dynamic-linker.sh
rename to test/dynamic-linker.sh
diff --git a/test/dynamic-list-data.sh b/test/dynamic-list-data.sh
new file mode 100755
index 00000000..b76c613a
--- /dev/null
+++ b/test/dynamic-list-data.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+# One data symbol (foo) and one function symbol (bar)
+$CC -o $t/a.o -c -xc - <<EOF
+int foo = 5;
+void bar() {}
+int main() {}
+EOF
+
+$CC -B. -o $t/exe $t/a.o -Wl,-dynamic-list-data
+
+# The dynamic symbol table must list foo but not bar
+readelf -W --dyn-syms $t/exe > $t/log
+grep -wq foo $t/log
+! grep -wq bar $t/log || false
diff --git a/test/elf/dynamic-list.sh b/test/dynamic-list.sh
similarity index 100%
rename from test/elf/dynamic-list.sh
rename to test/dynamic-list.sh
diff --git a/test/elf/dynamic-list2.sh b/test/dynamic-list2.sh
similarity index 100%
rename from test/elf/dynamic-list2.sh
rename to test/dynamic-list2.sh
diff --git a/test/elf/dynamic-list3.sh b/test/dynamic-list3.sh
similarity index 100%
rename from test/elf/dynamic-list3.sh
rename to test/dynamic-list3.sh
diff --git a/test/elf/dynamic-list4.sh b/test/dynamic-list4.sh
similarity index 100%
rename from test/elf/dynamic-list4.sh
rename to test/dynamic-list4.sh
diff --git a/test/elf/dynamic.sh b/test/dynamic.sh
similarity index 84%
rename from test/elf/dynamic.sh
rename to test/dynamic.sh
index ce207c6a..2b9576c2 100755
--- a/test/elf/dynamic.sh
+++ b/test/dynamic.sh
@@ -9,7 +9,7 @@ readelf --dynamic $t/exe > $t/log
 grep -Eq 'Shared library:.*\blibc\b' $t/log
 
 readelf -W --dyn-syms --use-dynamic $t/exe > $t/log2
-grep -Eq 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start_main' $t/log2
+grep -Eq 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start' $t/log2
 
 cat <<EOF | $CC -c -fPIC -o $t/b.o -xc -
 #include <stdio.h>
diff --git a/test/elf/mold-jobs.sh b/test/elf/mold-jobs.sh
deleted file mode 100755
index 46af6628..00000000
--- a/test/elf/mold-jobs.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-# Looks like lockf doesn't work correctly on qemu-riscv64
-[ $MACHINE = riscv64 -o $MACHINE = riscv32 ] && skip
-
-cat <<EOF | $CC -o $t/a.o -c -xc - -fno-PIE
-#include <stdio.h>
-int main() {
-  printf("Hello world\n");
-}
-EOF
-
-for i in `seq 1 20`; do
-  rm -f $t/exe$i
-  ( MOLD_JOBS=2 $CC -B. -o $t/exe$i $t/a.o -no-pie; echo $i) &
-done
-
-wait
-
-for i in `seq 1 20`; do
-  $QEMU $t/exe$i | grep -q 'Hello world'
-done
diff --git a/test/elf/now.sh b/test/elf/now.sh
deleted file mode 100755
index 37b83d26..00000000
--- a/test/elf/now.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-cat <<EOF | $CC -c -fPIC -o $t/a.o -xc -
-#include <stdio.h>
-
-void foo() {
-  printf("Hello world\n");
-}
-EOF
-
-$CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,now
-readelf --dynamic $t/b.so | grep -q 'Flags: NOW'
-
-$CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,now,-z,lazy
-readelf --dynamic $t/b.so > $t/log
-! grep -q 'Flags: NOW' $t/log || false
diff --git a/test/elf/pack-dyn-relocs-relr.sh b/test/elf/pack-dyn-relocs-relr.sh
deleted file mode 100755
index c2cad3f8..00000000
--- a/test/elf/pack-dyn-relocs-relr.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-[ $MACHINE = m68k ] && skip
-[ $MACHINE = ppc ] && skip
-
-command -v llvm-readelf >& /dev/null || skip
-
-cat <<EOF | $CC -o $t/a.o -fPIC -c -xc -
-#include <stdio.h>
-int main() {
-  printf("Hello world\n");
-}
-EOF
-
-$CC -B. -o $t/exe1 $t/a.o -pie
-llvm-readelf -r $t/exe1 | grep RELATIVE | wc -l > $t/log1
-
-$CC -B. -o $t/exe2 $t/a.o -pie -Wl,-pack-dyn-relocs=relr
-llvm-readelf -r $t/exe2 | grep RELATIVE | wc -l > $t/log2
-
-diff $t/log1 $t/log2
-
-llvm-readelf --dynamic $t/exe2 > $t/log3
-grep -wq RELR $t/log3
-grep -wq RELRSZ $t/log3
-grep -wq RELRENT $t/log3
diff --git a/test/elf/package-metadata.sh b/test/elf/package-metadata.sh
deleted file mode 100755
index 4c673bc5..00000000
--- a/test/elf/package-metadata.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-cat <<EOF | $CC -o $t/a.o -c -xc -
-#include <stdio.h>
-int main() {
-  printf("Hello world\n");
-}
-EOF
-
-$CC -B. -o $t/exe $t/a.o -Wl,-package-metadata='{"foo":"bar"}'
-readelf -x .note.package $t/exe | grep -Fq '{"foo":"bar"}'
diff --git a/test/elf/relocatable-no-ehframe.sh b/test/elf/relocatable-no-ehframe.sh
deleted file mode 100755
index d7c2e1a6..00000000
--- a/test/elf/relocatable-no-ehframe.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-[ $MACHINE = alpha ] && skip
-
-# OneTBB isn't tsan-clean
-nm mold | grep -q '__tsan_init' && skip
-
-cat <<EOF | $CC -c -o $t/a.o -xc -fno-unwind-tables -fno-asynchronous-unwind-tables -
-int foo() { return 1; }
-EOF
-
-readelf -WS $t/a.o > $t/log1
-! grep -Fq .eh_frame $t/log1 || false
-
-./mold --relocatable -o $t/b.o $t/a.o
-readelf -WS $t/b.o > $t/log2
-! grep -Fq .eh_frame $t/log2 || false
diff --git a/test/elf/shared-abs-sym.sh b/test/elf/shared-abs-sym.sh
deleted file mode 100755
index f462130c..00000000
--- a/test/elf/shared-abs-sym.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-cat <<EOF | $CC -B. -fPIC -shared -o $t/a.so -xassembler -
-.globl foo
-foo = 3;
-EOF
-
-cat <<EOF | $CC -B. -fPIC -shared -o $t/b.so -xassembler -
-.globl foo
-foo = 5;
-EOF
-
-cat <<EOF | $CC -fPIC -c -o $t/c.o -xc -
-#include <stdio.h>
-extern char foo;
-int main() { printf("foo=%p\n", &foo); }
-EOF
-
-# This test fails with older glibc
-$CC -B. -o $t/exe1 -pie $t/c.o $t/a.so 2> /dev/null || skip
-$QEMU $t/exe1 | grep -q 'foo=0x3' || skip
-LD_PRELOAD=$t/b.so $QEMU $t/exe1 | grep -q 'foo=0x5'
-
-$CC -B. -o $t/exe2 -pie $t/c.o $t/a.so
-$QEMU $t/exe2 | grep -q 'foo=0x3'
-LD_PRELOAD=$t/b.so $QEMU $t/exe2 | grep -q 'foo=0x5'
-
-$CC -B. -o $t/exe3 -no-pie $t/c.o $t/a.so
-$QEMU $t/exe3 | grep -q 'foo=0x3'
-LD_PRELOAD=$t/b.so $QEMU $t/exe3 | grep -q 'foo=0x5'
diff --git a/test/elf/z-pack-relative-relocs.sh b/test/elf/z-pack-relative-relocs.sh
deleted file mode 100755
index e09d441e..00000000
--- a/test/elf/z-pack-relative-relocs.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-. $(dirname $0)/common.inc
-
-cat <<EOF | $CC -o $t/a.o -fPIC -c -xc -
-#include <stdio.h>
-int main() {
-  printf("Hello world\n");
-}
-EOF
-
-$CC -B. -o $t/exe $t/a.o -pie -Wl,-z,pack-relative-relocs
-
-readelf -W -V $t/exe > $t/log
-grep -Fq GLIBC_2. $t/log || skip
-
-grep -q GLIBC_ABI_DT_RELR $t/log
diff --git a/test/elf/emit-relocs-cpp.sh b/test/emit-relocs-cpp.sh
similarity index 100%
rename from test/elf/emit-relocs-cpp.sh
rename to test/emit-relocs-cpp.sh
diff --git a/test/elf/emit-relocs-dead-sections.sh b/test/emit-relocs-dead-sections.sh
similarity index 100%
rename from test/elf/emit-relocs-dead-sections.sh
rename to test/emit-relocs-dead-sections.sh
diff --git a/test/elf/emit-relocs.sh b/test/emit-relocs.sh
similarity index 100%
rename from test/elf/emit-relocs.sh
rename to test/emit-relocs.sh
diff --git a/test/empty-arg.sh b/test/empty-arg.sh
new file mode 100755
index 00000000..60182b07
--- /dev/null
+++ b/test/empty-arg.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+# An empty string given as an input file name must be rejected.
+# `|| false` makes the test fail if mold unexpectedly succeeds.
+! ./mold -m elf_x86_64 '' >& $t/log || false
+grep -q 'cannot open :' $t/log
diff --git a/test/elf/empty-file.sh b/test/empty-file.sh
similarity index 100%
rename from test/elf/empty-file.sh
rename to test/empty-file.sh
diff --git a/test/elf/empty-input.sh b/test/empty-input.sh
similarity index 100%
rename from test/elf/empty-input.sh
rename to test/empty-input.sh
diff --git a/test/elf/empty-version.sh b/test/empty-version.sh
similarity index 100%
rename from test/elf/empty-version.sh
rename to test/empty-version.sh
diff --git a/test/elf/entry.sh b/test/entry.sh
similarity index 100%
rename from test/elf/entry.sh
rename to test/entry.sh
diff --git a/test/elf/exception-multiple-ehframe.sh b/test/exception-multiple-ehframe.sh
similarity index 96%
rename from test/elf/exception-multiple-ehframe.sh
rename to test/exception-multiple-ehframe.sh
index 1b9f434a..c411eb92 100755
--- a/test/elf/exception-multiple-ehframe.sh
+++ b/test/exception-multiple-ehframe.sh
@@ -5,7 +5,6 @@ nm mold | grep -q '__tsan_init' && skip
 
 command -v perl > /dev/null || skip
 
-[ $MACHINE = m68k ] && skip
 [ $MACHINE = sh4 ] && skip
 
 cat <<EOF | $CXX -o $t/a.o -c -xc++ -
diff --git a/test/elf/exception.sh b/test/exception.sh
similarity index 95%
rename from test/elf/exception.sh
rename to test/exception.sh
index 6761a76d..e054ec52 100755
--- a/test/elf/exception.sh
+++ b/test/exception.sh
@@ -1,9 +1,6 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = m68k ] && skip
-[ $MACHINE = sh4 ] && skip
-
 static=
 test_cxxflags -static && static=-static
 
diff --git a/test/elf/exclude-libs.sh b/test/exclude-libs.sh
similarity index 84%
rename from test/elf/exclude-libs.sh
rename to test/exclude-libs.sh
index eb390310..39243991 100755
--- a/test/elf/exclude-libs.sh
+++ b/test/exclude-libs.sh
@@ -48,6 +48,12 @@ readelf --dyn-syms $t/f.so > $t/log
 ! grep -Fq bar $t/log || false
 grep -Fq baz $t/log
 
+$CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a:d.a
+readelf --dyn-syms $t/f.so > $t/log
+! grep -Fq foo $t/log || false
+! grep -Fq bar $t/log || false
+grep -Fq baz $t/log
+
 $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=ALL
 readelf --dyn-syms $t/f.so > $t/log
 ! grep -Fq foo $t/log || false
diff --git a/test/elf/exclude-libs2.sh b/test/exclude-libs2.sh
similarity index 100%
rename from test/elf/exclude-libs2.sh
rename to test/exclude-libs2.sh
diff --git a/test/elf/exclude-libs3.sh b/test/exclude-libs3.sh
similarity index 100%
rename from test/elf/exclude-libs3.sh
rename to test/exclude-libs3.sh
diff --git a/test/elf/execstack.sh b/test/execstack.sh
similarity index 100%
rename from test/elf/execstack.sh
rename to test/execstack.sh
diff --git a/test/elf/execute-only.sh b/test/execute-only.sh
similarity index 100%
rename from test/elf/execute-only.sh
rename to test/execute-only.sh
diff --git a/test/elf/export-dynamic.sh b/test/export-dynamic.sh
similarity index 100%
rename from test/elf/export-dynamic.sh
rename to test/export-dynamic.sh
diff --git a/test/elf/export-from-exe.sh b/test/export-from-exe.sh
similarity index 100%
rename from test/elf/export-from-exe.sh
rename to test/export-from-exe.sh
diff --git a/test/elf/fatal-warnings.sh b/test/fatal-warnings.sh
similarity index 100%
rename from test/elf/fatal-warnings.sh
rename to test/fatal-warnings.sh
diff --git a/test/elf/filler.sh b/test/filler.sh
similarity index 100%
rename from test/elf/filler.sh
rename to test/filler.sh
diff --git a/test/elf/filter.sh b/test/filter.sh
similarity index 100%
rename from test/elf/filter.sh
rename to test/filter.sh
diff --git a/test/elf/func-addr.sh b/test/func-addr.sh
similarity index 100%
rename from test/elf/func-addr.sh
rename to test/func-addr.sh
diff --git a/test/elf/gc-sections.sh b/test/gc-sections.sh
similarity index 100%
rename from test/elf/gc-sections.sh
rename to test/gc-sections.sh
diff --git a/test/elf/gdb-index-compress-output.sh b/test/gdb-index-compress-output.sh
similarity index 100%
rename from test/elf/gdb-index-compress-output.sh
rename to test/gdb-index-compress-output.sh
diff --git a/test/elf/gdb-index-dwarf2.sh b/test/gdb-index-dwarf2.sh
similarity index 100%
rename from test/elf/gdb-index-dwarf2.sh
rename to test/gdb-index-dwarf2.sh
diff --git a/test/elf/gdb-index-dwarf3.sh b/test/gdb-index-dwarf3.sh
similarity index 100%
rename from test/elf/gdb-index-dwarf3.sh
rename to test/gdb-index-dwarf3.sh
diff --git a/test/elf/gdb-index-dwarf4.sh b/test/gdb-index-dwarf4.sh
similarity index 100%
rename from test/elf/gdb-index-dwarf4.sh
rename to test/gdb-index-dwarf4.sh
diff --git a/test/elf/gdb-index-dwarf5.sh b/test/gdb-index-dwarf5.sh
similarity index 86%
rename from test/elf/gdb-index-dwarf5.sh
rename to test/gdb-index-dwarf5.sh
index b7ec1af5..1f3ebc84 100755
--- a/test/elf/gdb-index-dwarf5.sh
+++ b/test/gdb-index-dwarf5.sh
@@ -65,6 +65,8 @@ $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -ffunction-sections
 
 $CC -B. -shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index
 readelf -WS $t/e.so 2> /dev/null | grep -Fq .gdb_index
+readelf --debug=gdb_index $t/e.so 2> /dev/null | grep -q 'fn1: .* \[global, function\]'
+readelf --debug=gdb_index $t/e.so 2> /dev/null | grep -q 'char: .* \[static, type\]'
 
 cat <<EOF | $CC -c -o $t/f.o -fPIC -g -ggnu-pubnames -gdwarf-5 -xc - -gz
 void fn1();
@@ -76,6 +78,7 @@ EOF
 
 $CC -B. -o $t/exe $t/e.so $t/f.o -Wl,--gdb-index
 readelf -WS $t/exe 2> /dev/null | grep -Fq .gdb_index
+readelf --debug=gdb_index $t/exe 2> /dev/null | grep -q 'main: .* \[global, function\]'
 
 $QEMU $t/exe | grep -q 'Hello world'
 
diff --git a/test/elf/gdb-index-dwarf64.sh b/test/gdb-index-dwarf64.sh
similarity index 100%
rename from test/elf/gdb-index-dwarf64.sh
rename to test/gdb-index-dwarf64.sh
diff --git a/test/elf/gdb-index-empty.sh b/test/gdb-index-empty.sh
similarity index 100%
rename from test/elf/gdb-index-empty.sh
rename to test/gdb-index-empty.sh
diff --git a/test/elf/gdb-index-split-dwarf.sh b/test/gdb-index-split-dwarf.sh
similarity index 100%
rename from test/elf/gdb-index-split-dwarf.sh
rename to test/gdb-index-split-dwarf.sh
diff --git a/test/elf/glibc-2.22-bug.sh b/test/glibc-2.22-bug.sh
similarity index 94%
rename from test/elf/glibc-2.22-bug.sh
rename to test/glibc-2.22-bug.sh
index 1539d209..27820acc 100755
--- a/test/elf/glibc-2.22-bug.sh
+++ b/test/glibc-2.22-bug.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = alpha ] && skip
-
 # glibc 2.22 or prior have a bug that ld-linux.so.2 crashes on dlopen()
 # if .rela.dyn and .rela.plt are not contiguous in a given DSO.
diff --git a/test/elf/global-offset-table.sh b/test/global-offset-table.sh
similarity index 100%
rename from test/elf/global-offset-table.sh
rename to test/global-offset-table.sh
diff --git a/test/elf/gnu-hash.sh b/test/gnu-hash.sh
similarity index 100%
rename from test/elf/gnu-hash.sh
rename to test/gnu-hash.sh
diff --git a/test/gnu-property.sh b/test/gnu-property.sh
new file mode 100755
index 00000000..aff85c01
--- /dev/null
+++ b/test/gnu-property.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+$CC -o $t/a.o -c -xc - <<EOF
+int main() {}
+EOF
+
+$CC -B. -o $t/exe $t/a.o -no-pie
+
+# Nothing to check if the output has no .note.gnu.property section
+readelf -W --sections $t/exe | grep -Fqw .note.gnu.property || skip
+
+# Otherwise the program headers must include a GNU_PROPERTY segment
+readelf -W --segments $t/exe | grep -qw GNU_PROPERTY
diff --git a/test/elf/gnu-retain.sh b/test/gnu-retain.sh
similarity index 100%
rename from test/elf/gnu-retain.sh
rename to test/gnu-retain.sh
diff --git a/test/elf/gnu-unique.sh b/test/gnu-unique.sh
similarity index 100%
rename from test/elf/gnu-unique.sh
rename to test/gnu-unique.sh
diff --git a/test/elf/gnu-warning.sh b/test/gnu-warning.sh
similarity index 100%
rename from test/elf/gnu-warning.sh
rename to test/gnu-warning.sh
diff --git a/test/elf/hash-style.sh b/test/hash-style.sh
similarity index 100%
rename from test/elf/hash-style.sh
rename to test/hash-style.sh
diff --git a/test/elf/hello-dynamic.sh b/test/hello-dynamic.sh
similarity index 100%
rename from test/elf/hello-dynamic.sh
rename to test/hello-dynamic.sh
diff --git a/test/elf/hello-static.sh b/test/hello-static.sh
similarity index 100%
rename from test/elf/hello-static.sh
rename to test/hello-static.sh
diff --git a/test/elf/help.sh b/test/help.sh
similarity index 100%
rename from test/elf/help.sh
rename to test/help.sh
diff --git a/test/hidden-archive.sh b/test/hidden-archive.sh
new file mode 100755
index 00000000..9364e198
--- /dev/null
+++ b/test/hidden-archive.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+# a.o defines foo; it is packed into the static archive b.a below.
+cat <<EOF | $CC -c -o $t/a.o -fPIC -xc -
+void foo() {}
+EOF
+
+rm -f $t/b.a
+ar rcs $t/b.a $t/a.o
+# c.so is a shared library that also defines foo.
+cat <<EOF | $CC -shared -o $t/c.so -fPIC -xc -
+void foo() {}
+EOF
+# d.o declares foo with hidden visibility and calls it from main.
+cat <<EOF | $CC -o $t/d.o -fPIC -c -xc -
+__attribute__((visibility("hidden"))) void foo();
+int main() { foo(); }
+EOF
+# NOTE(review): presumably the hidden reference must bind to b.a, not c.so -- confirm.
+$CC -B. -o $t/exe $t/d.o $t/c.so $t/b.a
+$QEMU $t/exe
diff --git a/test/elf/hidden-undef.sh b/test/hidden-undef.sh
similarity index 100%
rename from test/elf/hidden-undef.sh
rename to test/hidden-undef.sh
diff --git a/test/elf/hidden-weak-undef.sh b/test/hidden-weak-undef.sh
similarity index 100%
rename from test/elf/hidden-weak-undef.sh
rename to test/hidden-weak-undef.sh
diff --git a/test/elf/icf-safe.sh b/test/icf-safe.sh
similarity index 100%
rename from test/elf/icf-safe.sh
rename to test/icf-safe.sh
diff --git a/test/elf/icf-small.sh b/test/icf-small.sh
similarity index 100%
rename from test/elf/icf-small.sh
rename to test/icf-small.sh
diff --git a/test/elf/icf.sh b/test/icf.sh
similarity index 100%
rename from test/elf/icf.sh
rename to test/icf.sh
diff --git a/test/elf/ifunc-address-equality-exported.sh b/test/ifunc-address-equality-exported.sh
similarity index 100%
rename from test/elf/ifunc-address-equality-exported.sh
rename to test/ifunc-address-equality-exported.sh
diff --git a/test/elf/ifunc-address-equality.sh b/test/ifunc-address-equality.sh
similarity index 100%
rename from test/elf/ifunc-address-equality.sh
rename to test/ifunc-address-equality.sh
diff --git a/test/elf/ifunc-alias.sh b/test/ifunc-alias.sh
similarity index 100%
rename from test/elf/ifunc-alias.sh
rename to test/ifunc-alias.sh
diff --git a/test/elf/ifunc-dlopen.sh b/test/ifunc-dlopen.sh
similarity index 100%
rename from test/elf/ifunc-dlopen.sh
rename to test/ifunc-dlopen.sh
diff --git a/test/elf/ifunc-dso.sh b/test/ifunc-dso.sh
similarity index 100%
rename from test/elf/ifunc-dso.sh
rename to test/ifunc-dso.sh
diff --git a/test/elf/ifunc-dynamic.sh b/test/ifunc-dynamic.sh
similarity index 100%
rename from test/elf/ifunc-dynamic.sh
rename to test/ifunc-dynamic.sh
diff --git a/test/elf/ifunc-export.sh b/test/ifunc-export.sh
similarity index 100%
rename from test/elf/ifunc-export.sh
rename to test/ifunc-export.sh
diff --git a/test/elf/ifunc-funcptr.sh b/test/ifunc-funcptr.sh
similarity index 100%
rename from test/elf/ifunc-funcptr.sh
rename to test/ifunc-funcptr.sh
diff --git a/test/elf/ifunc-noplt.sh b/test/ifunc-noplt.sh
similarity index 100%
rename from test/elf/ifunc-noplt.sh
rename to test/ifunc-noplt.sh
diff --git a/test/elf/ifunc-static-pie.sh b/test/ifunc-static-pie.sh
similarity index 100%
rename from test/elf/ifunc-static-pie.sh
rename to test/ifunc-static-pie.sh
diff --git a/test/elf/ifunc-static.sh b/test/ifunc-static.sh
similarity index 100%
rename from test/elf/ifunc-static.sh
rename to test/ifunc-static.sh
diff --git a/test/elf/image-base.sh b/test/image-base.sh
similarity index 100%
rename from test/elf/image-base.sh
rename to test/image-base.sh
diff --git a/test/elf/init-array-priorities.sh b/test/init-array-priorities.sh
similarity index 100%
rename from test/elf/init-array-priorities.sh
rename to test/init-array-priorities.sh
diff --git a/test/elf/init-in-dso.sh b/test/init-in-dso.sh
similarity index 100%
rename from test/elf/init-in-dso.sh
rename to test/init-in-dso.sh
diff --git a/test/elf/init.sh b/test/init.sh
similarity index 100%
rename from test/elf/init.sh
rename to test/init.sh
diff --git a/test/elf/initfirst.sh b/test/initfirst.sh
similarity index 100%
rename from test/elf/initfirst.sh
rename to test/initfirst.sh
diff --git a/test/elf/interpose.sh b/test/interpose.sh
similarity index 100%
rename from test/elf/interpose.sh
rename to test/interpose.sh
diff --git a/test/elf/invalid-version-script.sh b/test/invalid-version-script.sh
similarity index 100%
rename from test/elf/invalid-version-script.sh
rename to test/invalid-version-script.sh
diff --git a/test/elf/issue646.sh b/test/issue646.sh
similarity index 89%
rename from test/elf/issue646.sh
rename to test/issue646.sh
index e419d4a7..a33f473e 100755
--- a/test/elf/issue646.sh
+++ b/test/issue646.sh
@@ -1,9 +1,6 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = m68k ] && skip
-[ $MACHINE = sh4 ] && skip
-
 cat <<EOF | $CXX -o $t/a.o -c -xc++ -
 #include <iostream>
 #include <stdexcept>
diff --git a/test/elf/large-alignment-dso.sh b/test/large-alignment-dso.sh
similarity index 100%
rename from test/elf/large-alignment-dso.sh
rename to test/large-alignment-dso.sh
diff --git a/test/elf/large-alignment.sh b/test/large-alignment.sh
similarity index 100%
rename from test/elf/large-alignment.sh
rename to test/large-alignment.sh
diff --git a/test/elf/large-max-page-size-strip.sh b/test/large-max-page-size-strip.sh
similarity index 100%
rename from test/elf/large-max-page-size-strip.sh
rename to test/large-max-page-size-strip.sh
diff --git a/test/elf/large-max-page-size.sh b/test/large-max-page-size.sh
similarity index 100%
rename from test/elf/large-max-page-size.sh
rename to test/large-max-page-size.sh
diff --git a/test/elf/large-text.sh b/test/large-text.sh
similarity index 100%
rename from test/elf/large-text.sh
rename to test/large-text.sh
diff --git a/test/library.sh b/test/library.sh
new file mode 100755
index 00000000..91d40bff
--- /dev/null
+++ b/test/library.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $CC -o $t/a.o -c -xc - -fPIC
+#include <stdio.h>
+void hello() {
+  printf("Hello world\n");
+}
+EOF
+
+$CC -B. -shared -o $t/libfoobar.so $t/a.o
+
+cat <<EOF | $CC -o $t/c.o -c -xc -
+void hello();
+int main() { hello(); }
+EOF
+
+$CC -B. -o $t/exe1 $t/c.o -L$t -Wl,--library,foobar -Wl,-rpath,$t
+$QEMU $t/exe1 | grep -q 'Hello world'
+
+$CC -B. -o $t/exe2 $t/c.o -L$t -Wl,--library=foobar -Wl,-rpath,$t
+$QEMU $t/exe2 | grep -q 'Hello world'
diff --git a/test/elf/link-order.sh b/test/link-order.sh
similarity index 100%
rename from test/elf/link-order.sh
rename to test/link-order.sh
diff --git a/test/elf/linker-script-defsym.sh b/test/linker-script-defsym.sh
similarity index 100%
rename from test/elf/linker-script-defsym.sh
rename to test/linker-script-defsym.sh
diff --git a/test/elf/linker-script-error.sh b/test/linker-script-error.sh
similarity index 100%
rename from test/elf/linker-script-error.sh
rename to test/linker-script-error.sh
diff --git a/test/elf/linker-script-relocatable.sh b/test/linker-script-relocatable.sh
similarity index 100%
rename from test/elf/linker-script-relocatable.sh
rename to test/linker-script-relocatable.sh
diff --git a/test/elf/linker-script.sh b/test/linker-script.sh
similarity index 100%
rename from test/elf/linker-script.sh
rename to test/linker-script.sh
diff --git a/test/elf/linker-script2.sh b/test/linker-script2.sh
similarity index 100%
rename from test/elf/linker-script2.sh
rename to test/linker-script2.sh
diff --git a/test/elf/linker-script3.sh b/test/linker-script3.sh
similarity index 100%
rename from test/elf/linker-script3.sh
rename to test/linker-script3.sh
diff --git a/test/elf/linker-script4.sh b/test/linker-script4.sh
similarity index 100%
rename from test/elf/linker-script4.sh
rename to test/linker-script4.sh
diff --git a/test/elf/linker-script5.sh b/test/linker-script5.sh
similarity index 100%
rename from test/elf/linker-script5.sh
rename to test/linker-script5.sh
diff --git a/test/elf/linker-script6.sh b/test/linker-script6.sh
similarity index 100%
rename from test/elf/linker-script6.sh
rename to test/linker-script6.sh
diff --git a/test/elf/lto-archive.sh b/test/lto-archive.sh
similarity index 87%
rename from test/elf/lto-archive.sh
rename to test/lto-archive.sh
index 88ce90f0..3938a9b2 100755
--- a/test/elf/lto-archive.sh
+++ b/test/lto-archive.sh
@@ -2,9 +2,7 @@
 . $(dirname $0)/common.inc
 
 [ "$CC" = cc ] || skip
-
-echo 'int main() {}' | $CC -flto -o /dev/null -xc - >& /dev/null \
-  || skip
+test_cflags -flto || skip
 
 cat <<EOF | $CC -o $t/a.o -c -flto -xc -
 #include <stdio.h>
diff --git a/test/elf/lto-archive2.sh b/test/lto-archive2.sh
similarity index 73%
rename from test/elf/lto-archive2.sh
rename to test/lto-archive2.sh
index 43572941..8a63b9d7 100755
--- a/test/elf/lto-archive2.sh
+++ b/test/lto-archive2.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-echo 'int main() {}' | $CC -flto=auto -o /dev/null -xc - >& /dev/null || skip
+test_cflags -flto=auto || skip
 
 echo | $CC -o $t/a.o -c -flto=auto -xc -
 
diff --git a/test/elf/lto-dso.sh b/test/lto-dso.sh
similarity index 80%
rename from test/elf/lto-dso.sh
rename to test/lto-dso.sh
index 5fe3c4d5..61c27794 100755
--- a/test/elf/lto-dso.sh
+++ b/test/lto-dso.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-echo 'int main() {}' | $CC -flto -o /dev/null -xc - >& /dev/null \
-  || skip
+test_cflags -flto || skip
 
 cat <<EOF | $CC -flto -c -fPIC -o $t/a.o -xc -
 void foo() {}
diff --git a/test/elf/lto-gcc.sh b/test/lto-gcc.sh
similarity index 91%
rename from test/elf/lto-gcc.sh
rename to test/lto-gcc.sh
index 6fbabb4f..105045d1 100755
--- a/test/elf/lto-gcc.sh
+++ b/test/lto-gcc.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-echo 'int main() {}' | $GCC -flto -o /dev/null -xc - >& /dev/null \
+echo 'int main() {}' | $GCC -B. -flto -o /dev/null -xc - >& /dev/null \
   || skip
 
 cat <<EOF | $GCC -flto -c -o $t/a.o -xc -
diff --git a/test/elf/lto-llvm.sh b/test/lto-llvm.sh
similarity index 78%
rename from test/elf/lto-llvm.sh
rename to test/lto-llvm.sh
index 6d54f0c3..f32c6612 100755
--- a/test/elf/lto-llvm.sh
+++ b/test/lto-llvm.sh
@@ -3,7 +3,7 @@
 
 [ $MACHINE = $(uname -m) ] || skip
 
-echo 'int main() {}' | clang -flto -o /dev/null -xc - >& /dev/null \
+echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null \
   || skip
 
 cat <<EOF | clang -flto -c -o $t/a.o -xc -
diff --git a/test/elf/lto-nostdlib.sh b/test/lto-nostdlib.sh
similarity index 87%
rename from test/elf/lto-nostdlib.sh
rename to test/lto-nostdlib.sh
index 069df068..dc0eaf66 100755
--- a/test/elf/lto-nostdlib.sh
+++ b/test/lto-nostdlib.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
+test_cflags -flto || skip
+
 cat <<EOF | $CC -flto -c -o $t/a.o -xc -
 void _start() {}
 EOF
diff --git a/test/elf/lto-version-script.sh b/test/lto-version-script.sh
similarity index 95%
rename from test/elf/lto-version-script.sh
rename to test/lto-version-script.sh
index 164ea0f9..a6fce5fd 100755
--- a/test/elf/lto-version-script.sh
+++ b/test/lto-version-script.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
+test_cflags -flto || skip
+
 cat <<EOF | $CC -flto -c -fPIC -o $t/a.o -xc -
 void foo() {}
 void bar() {}
diff --git a/test/elf/main-in-dso.sh b/test/main-in-dso.sh
similarity index 100%
rename from test/elf/main-in-dso.sh
rename to test/main-in-dso.sh
diff --git a/test/elf/many-sections.sh b/test/many-sections.sh
similarity index 100%
rename from test/elf/many-sections.sh
rename to test/many-sections.sh
diff --git a/test/elf/many-sections2.sh b/test/many-sections2.sh
similarity index 82%
rename from test/elf/many-sections2.sh
rename to test/many-sections2.sh
index 891d21c2..7d37be19 100755
--- a/test/elf/many-sections2.sh
+++ b/test/many-sections2.sh
@@ -5,7 +5,7 @@
 nm mold | grep -q '__tsan_init' && skip
 
 echo 'foo = 0x1000' > $t/a.s
-seq 1 100000 | sed 's/.*/.section .data.\0,"aw"\n.globl x\0\nx\0: .word 0\n/g' >> $t/a.s
+seq 1 100000 | sed 's/.*/.section .data.&,"aw"\n.globl x&\nx&: .word 0\n/g' >> $t/a.s
 $CC -c -xassembler -o $t/a.o $t/a.s
 
 ./mold --relocatable -o $t/b.o $t/a.o
diff --git a/test/elf/mergeable-strings.sh b/test/mergeable-strings.sh
similarity index 100%
rename from test/elf/mergeable-strings.sh
rename to test/mergeable-strings.sh
diff --git a/test/elf/missing-but-ok.sh b/test/missing-but-ok.sh
similarity index 100%
rename from test/elf/missing-but-ok.sh
rename to test/missing-but-ok.sh
diff --git a/test/elf/missing-error.sh b/test/missing-error.sh
similarity index 100%
rename from test/elf/missing-error.sh
rename to test/missing-error.sh
diff --git a/test/elf/mold-wrapper.sh b/test/mold-wrapper.sh
similarity index 98%
rename from test/elf/mold-wrapper.sh
rename to test/mold-wrapper.sh
index 2bd0bb99..4748c8d7 100755
--- a/test/elf/mold-wrapper.sh
+++ b/test/mold-wrapper.sh
@@ -8,7 +8,7 @@ ldd mold-wrapper.so | grep -q libasan && skip
 nm mold | grep -q '__[at]san_init' && skip
 
 cat <<'EOF' > $t/a.sh
-#!/bin/bash
+#!/usr/bin/env bash
 echo "$0" "$@" $FOO
 EOF
 
diff --git a/test/elf/mold-wrapper2.sh b/test/mold-wrapper2.sh
similarity index 100%
rename from test/elf/mold-wrapper2.sh
rename to test/mold-wrapper2.sh
diff --git a/test/elf/nmagic.sh b/test/nmagic.sh
similarity index 100%
rename from test/elf/nmagic.sh
rename to test/nmagic.sh
diff --git a/test/no-allow-shlib-undefined.sh b/test/no-allow-shlib-undefined.sh
new file mode 100755
index 00000000..846e046a
--- /dev/null
+++ b/test/no-allow-shlib-undefined.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+# libfoo.so defines foo.
+cat <<EOF | $CC -B. -shared -fPIC -o $t/libfoo.so -xc -
+void foo() {}
+EOF
+# libbar.so calls foo without defining it, so foo stays undefined in this DSO.
+cat <<EOF | $CC -B. -shared -fPIC -o $t/libbar.so -xc -
+void foo();
+void bar() { foo(); }
+EOF
+# The main program only calls bar; the prototype matches libbar.so's definition.
+cat <<EOF | $CC -c -o $t/a.o -xc -
+void bar();
+int main() { bar(); }
+EOF
+# With libfoo on the command line, libbar.so's reference to foo is satisfied.
+$CC -B. -o $t/exe1 $t/a.o -Wl,--no-allow-shlib-undefined -L$t -lfoo -lbar
+# Without libfoo the link must fail and name the undefined symbol.
+! $CC -B. -o $t/exe2 $t/a.o -Wl,--no-allow-shlib-undefined -L$t -lbar >& $t/log || false
+grep -Fq 'undefined symbol: foo' $t/log
diff --git a/test/elf/no-eh-frame-header.sh b/test/no-eh-frame-header.sh
similarity index 100%
rename from test/elf/no-eh-frame-header.sh
rename to test/no-eh-frame-header.sh
diff --git a/test/elf/bug178.sh b/test/no-object-file.sh
similarity index 100%
rename from test/elf/bug178.sh
rename to test/no-object-file.sh
diff --git a/test/elf/no-quick-exit.sh b/test/no-quick-exit.sh
similarity index 100%
rename from test/elf/no-quick-exit.sh
rename to test/no-quick-exit.sh
diff --git a/test/elf/no-undefined-version.sh b/test/no-undefined-version.sh
similarity index 100%
rename from test/elf/no-undefined-version.sh
rename to test/no-undefined-version.sh
diff --git a/test/elf/nocopyreloc.sh b/test/nocopyreloc.sh
similarity index 95%
rename from test/elf/nocopyreloc.sh
rename to test/nocopyreloc.sh
index 06165fbc..bcfa044f 100755
--- a/test/elf/nocopyreloc.sh
+++ b/test/nocopyreloc.sh
@@ -7,7 +7,6 @@
 [ $MACHINE = ppc64 ] && skip
 [ $MACHINE = ppc64le ] && skip
 [ $MACHINE = sh4 ] && skip
-[ $MACHINE = alpha ] && skip
 [[ $MACHINE = loongarch* ]] && skip
 
 cat <<EOF | $CC -shared -o $t/a.so -xc -
diff --git a/test/elf/noinhibit-exec.sh b/test/noinhibit-exec.sh
similarity index 100%
rename from test/elf/noinhibit-exec.sh
rename to test/noinhibit-exec.sh
diff --git a/test/elf/non-canonical-plt.sh b/test/non-canonical-plt.sh
similarity index 100%
rename from test/elf/non-canonical-plt.sh
rename to test/non-canonical-plt.sh
diff --git a/test/elf/nostdlib.sh b/test/nostdlib.sh
similarity index 100%
rename from test/elf/nostdlib.sh
rename to test/nostdlib.sh
diff --git a/test/elf/oformat-binary.sh b/test/oformat-binary.sh
similarity index 100%
rename from test/elf/oformat-binary.sh
rename to test/oformat-binary.sh
diff --git a/test/elf/omagic.sh b/test/omagic.sh
similarity index 100%
rename from test/elf/omagic.sh
rename to test/omagic.sh
diff --git a/test/package-metadata.sh b/test/package-metadata.sh
new file mode 100755
index 00000000..f766b9f5
--- /dev/null
+++ b/test/package-metadata.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+# Any small program will do; only the .note.package section is inspected.
+cat <<EOF | $CC -o $t/a.o -c -xc -
+#include <stdio.h>
+int main() {
+  printf("Hello world\n");
+}
+EOF
+# Raw JSON given via -package-metadata must show up in .note.package.
+$CC -B. -o $t/exe1 $t/a.o -Wl,-package-metadata='{"foo":"bar"}'
+readelf -x .note.package $t/exe1 | grep -Fq '{"foo":"bar"}'
+# The percent-encoded form of the same JSON must decode to the same contents.
+$CC -B. -o $t/exe2 $t/a.o -Wl,--encoded-package-metadata=%7B%22foo%22%3A%22bar%22%7D
+readelf -x .note.package $t/exe2 | grep -Fq '{"foo":"bar"}'
+# A malformed escape (%x) must be rejected; "|| false" fails the test if the link succeeds.
+! $CC -B. -o $t/exe3 $t/a.o -Wl,--encoded-package-metadata=foo%x >& $t/log || false
+grep -q 'invalid string: foo%x' $t/log
diff --git a/test/elf/physical-image-base.sh b/test/physical-image-base.sh
similarity index 100%
rename from test/elf/physical-image-base.sh
rename to test/physical-image-base.sh
diff --git a/test/elf/pie.sh b/test/pie.sh
similarity index 100%
rename from test/elf/pie.sh
rename to test/pie.sh
diff --git a/test/elf/plt-dso.sh b/test/plt-dso.sh
similarity index 100%
rename from test/elf/plt-dso.sh
rename to test/plt-dso.sh
diff --git a/test/elf/pltgot.sh b/test/pltgot.sh
similarity index 100%
rename from test/elf/pltgot.sh
rename to test/pltgot.sh
diff --git a/test/elf/preinit-array.sh b/test/preinit-array.sh
similarity index 100%
rename from test/elf/preinit-array.sh
rename to test/preinit-array.sh
diff --git a/test/elf/print-dependencies.sh b/test/print-dependencies.sh
similarity index 100%
rename from test/elf/print-dependencies.sh
rename to test/print-dependencies.sh
diff --git a/test/elf/protected-dynsym.sh b/test/protected-dynsym.sh
similarity index 100%
rename from test/elf/protected-dynsym.sh
rename to test/protected-dynsym.sh
diff --git a/test/elf/protected.sh b/test/protected.sh
similarity index 100%
rename from test/elf/protected.sh
rename to test/protected.sh
diff --git a/test/elf/push-pop-state.sh b/test/push-pop-state.sh
similarity index 100%
rename from test/elf/push-pop-state.sh
rename to test/push-pop-state.sh
diff --git a/test/elf/range-extension-thunk.sh b/test/range-extension-thunk.sh
similarity index 85%
rename from test/elf/range-extension-thunk.sh
rename to test/range-extension-thunk.sh
index cde896e0..065287dd 100755
--- a/test/elf/range-extension-thunk.sh
+++ b/test/range-extension-thunk.sh
@@ -4,10 +4,16 @@
 # Skip if 32 bits as we use very large addresses in this test.
 [ $MACHINE = i686 ] && skip
 [ $MACHINE = riscv32 ] && skip
+[ $MACHINE = m68k ] && skip
 
 # It looks like SPARC's runtime can't handle PLT if it's too far from GOT.
 [ $MACHINE = sparc64 ] && skip
 
+# Current LoongArch compilers emit BL for function calls, but I believe
+# they'll emit PCADDU18I + JIRL (which can address PC ± 128 GiB) in the
+# future.
+[[ $MACHINE = loongarch* ]] && skip
+
 # qemu aborts with the "Unknown exception 0x5" error, although this
 # test passes on a real POWER10 machine.
 on_qemu && [ "$CPU" = power10 ] && skip
diff --git a/test/elf/range-extension-thunk2.sh b/test/range-extension-thunk2.sh
similarity index 100%
rename from test/elf/range-extension-thunk2.sh
rename to test/range-extension-thunk2.sh
diff --git a/test/elf/range-extension-thunk3.sh b/test/range-extension-thunk3.sh
similarity index 55%
rename from test/elf/range-extension-thunk3.sh
rename to test/range-extension-thunk3.sh
index bb45e594..cdc8e2b9 100755
--- a/test/elf/range-extension-thunk3.sh
+++ b/test/range-extension-thunk3.sh
@@ -1,15 +1,14 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
-[ $MACHINE = alpha ] && skip
 [ $MACHINE = sh4 ] && skip
 
-seq 1 10000 | sed 's/.*/void func\0() {}/' > $t/a.c
+seq 1 10000 | sed 's/.*/void func&() {}/' > $t/a.c
 $CC -B. -o $t/b.so -shared $t/a.c
 
-seq 1 10000 | sed 's/.*/void func\0();/' > $t/c.c
+seq 1 10000 | sed 's/.*/void func&();/' > $t/c.c
 echo 'int main() {' >> $t/c.c
-seq 1 10000 | sed 's/.*/func\0();/' >> $t/c.c
+seq 1 10000 | sed 's/.*/func&();/' >> $t/c.c
 echo '}' >> $t/c.c
 
 $CC -c -o $t/d.o $t/c.c
diff --git a/test/elf/relax-got-load.sh b/test/relax-got-load.sh
similarity index 100%
rename from test/elf/relax-got-load.sh
rename to test/relax-got-load.sh
diff --git a/test/elf/reloc-rodata.sh b/test/reloc-rodata.sh
similarity index 100%
rename from test/elf/reloc-rodata.sh
rename to test/reloc-rodata.sh
diff --git a/test/elf/relocatable-archive.sh b/test/relocatable-archive.sh
similarity index 100%
rename from test/elf/relocatable-archive.sh
rename to test/relocatable-archive.sh
diff --git a/test/elf/relocatable-c++.sh b/test/relocatable-c++.sh
similarity index 88%
rename from test/elf/relocatable-c++.sh
rename to test/relocatable-c++.sh
index e20cdfe5..6ce5e7be 100755
--- a/test/elf/relocatable-c++.sh
+++ b/test/relocatable-c++.sh
@@ -4,10 +4,6 @@
 # OneTBB isn't tsan-clean
 nm mold | grep -q '__tsan_init' && skip
 
-# Ubuntu 22.04 GCC is broken
-[ $MACHINE = m68k ] && skip
-[ $MACHINE = sh4 ] && skip
-
 cat <<EOF | $CXX -c -o $t/a.o -xc++ -
 void hello();
 void world();
diff --git a/test/elf/relocatable-compressed-debug-info.sh b/test/relocatable-compressed-debug-info.sh
similarity index 100%
rename from test/elf/relocatable-compressed-debug-info.sh
rename to test/relocatable-compressed-debug-info.sh
diff --git a/test/elf/relocatable-debug-info.sh b/test/relocatable-debug-info.sh
similarity index 100%
rename from test/elf/relocatable-debug-info.sh
rename to test/relocatable-debug-info.sh
diff --git a/test/elf/relocatable-exception.sh b/test/relocatable-exception.sh
similarity index 100%
rename from test/elf/relocatable-exception.sh
rename to test/relocatable-exception.sh
diff --git a/test/elf/relocatable-many-sections.sh b/test/relocatable-many-sections.sh
similarity index 100%
rename from test/elf/relocatable-many-sections.sh
rename to test/relocatable-many-sections.sh
diff --git a/test/elf/relocatable-merge-sections.sh b/test/relocatable-merge-sections.sh
similarity index 100%
rename from test/elf/relocatable-merge-sections.sh
rename to test/relocatable-merge-sections.sh
diff --git a/test/elf/relocatable-mergeable-sections.sh b/test/relocatable-mergeable-sections.sh
similarity index 100%
rename from test/elf/relocatable-mergeable-sections.sh
rename to test/relocatable-mergeable-sections.sh
diff --git a/test/elf/relocatable.sh b/test/relocatable.sh
similarity index 100%
rename from test/elf/relocatable.sh
rename to test/relocatable.sh
diff --git a/test/elf/relro.sh b/test/relro.sh
similarity index 100%
rename from test/elf/relro.sh
rename to test/relro.sh
diff --git a/test/elf/repro.sh b/test/repro.sh
similarity index 100%
rename from test/elf/repro.sh
rename to test/repro.sh
diff --git a/test/elf/require-defined.sh b/test/require-defined.sh
similarity index 100%
rename from test/elf/require-defined.sh
rename to test/require-defined.sh
diff --git a/test/elf/response-file.sh b/test/response-file.sh
similarity index 100%
rename from test/elf/response-file.sh
rename to test/response-file.sh
diff --git a/test/elf/response-file2.sh b/test/response-file2.sh
similarity index 100%
rename from test/elf/response-file2.sh
rename to test/response-file2.sh
diff --git a/test/elf/retain-symbols-file.sh b/test/retain-symbols-file.sh
similarity index 69%
rename from test/elf/retain-symbols-file.sh
rename to test/retain-symbols-file.sh
index 91c2ddfa..dba11d3c 100755
--- a/test/elf/retain-symbols-file.sh
+++ b/test/retain-symbols-file.sh
@@ -16,8 +16,8 @@ EOF
 $CC -B. -o $t/exe $t/a.o -Wl,--retain-symbols-file=$t/symbols
 readelf -W --symbols $t/exe > $t/log
 
-! grep -qw foo $t/log || false
-! grep -qw bar $t/log || false
-! grep -qw main $t/log || false
+! grep -q ' foo$' $t/log || false
+! grep -q ' bar$' $t/log || false
+! grep -q ' main$' $t/log || false
 
-grep -qw baz $t/log
+grep -q ' baz$' $t/log
diff --git a/test/elf/reverse-sections.sh b/test/reverse-sections.sh
similarity index 100%
rename from test/elf/reverse-sections.sh
rename to test/reverse-sections.sh
diff --git a/test/elf/rodata-name.sh b/test/rodata-name.sh
similarity index 100%
rename from test/elf/rodata-name.sh
rename to test/rodata-name.sh
diff --git a/test/elf/rosegment.sh b/test/rosegment.sh
similarity index 100%
rename from test/elf/rosegment.sh
rename to test/rosegment.sh
diff --git a/test/elf/rpath.sh b/test/rpath.sh
similarity index 100%
rename from test/elf/rpath.sh
rename to test/rpath.sh
diff --git a/test/elf/run-clang.sh b/test/run-clang.sh
similarity index 100%
rename from test/elf/run-clang.sh
rename to test/run-clang.sh
diff --git a/test/elf/run.sh b/test/run.sh
similarity index 97%
rename from test/elf/run.sh
rename to test/run.sh
index b103fd07..e6257636 100755
--- a/test/elf/run.sh
+++ b/test/run.sh
@@ -16,7 +16,7 @@ int main() {
 EOF
 
 LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \
-  $GCC -o $t/exe $t/a.o -B/usr/bin
+  $CC -o $t/exe $t/a.o -B/usr/bin
 readelf -p .comment $t/exe > $t/log
 grep -q mold $t/log
 
diff --git a/test/elf/section-align.sh b/test/section-align.sh
similarity index 100%
rename from test/elf/section-align.sh
rename to test/section-align.sh
diff --git a/test/elf/section-attributes.sh b/test/section-attributes.sh
similarity index 100%
rename from test/elf/section-attributes.sh
rename to test/section-attributes.sh
diff --git a/test/elf/section-order.sh b/test/section-order.sh
similarity index 97%
rename from test/elf/section-order.sh
rename to test/section-order.sh
index fb856ed8..989089cd 100755
--- a/test/elf/section-order.sh
+++ b/test/section-order.sh
@@ -3,6 +3,7 @@
 
 # qemu crashes if the ELF header is not mapped to memory
 on_qemu && skip
+[ "$(uname)" = FreeBSD ] && skip
 
 cat <<EOF | $CC -o $t/a.o -c -xc -fno-PIC $flags -
 #include <stdio.h>
diff --git a/test/elf/section-start.sh b/test/section-start.sh
similarity index 100%
rename from test/elf/section-start.sh
rename to test/section-start.sh
diff --git a/test/separate-debug-file.sh b/test/separate-debug-file.sh
new file mode 100755
index 00000000..7430c94e
--- /dev/null
+++ b/test/separate-debug-file.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+# Skip if mold contains __tsan_init, if on_qemu reports true, or if gdb/flock is missing.
+nm mold | grep -q '__tsan_init' && skip
+on_qemu && skip
+command -v gdb >& /dev/null || skip
+command -v flock >& /dev/null || skip
+# Write the test program to a file; it is compiled with -g below.
+cat <<EOF > $t/a.c
+#include <stdio.h>
+int main() {
+  printf("Hello world\n");
+}
+EOF
+# Link with --separate-debug-file and expect a .gnu_debuglink section in the executable.
+$CC -c -o $t/a.o $t/a.c -g
+$CC -B. -o $t/exe1 $t/a.o -Wl,--separate-debug-file
+readelf -SW $t/exe1 | grep -Fq .gnu_debuglink
+# NOTE(review): flock presumably waits until the linker has finished the debug file -- confirm.
+flock $t/exe1 true
+gdb $t/exe1 -ex 'list main' -ex 'quit' | grep -Fq printf
+# Same checks again with --no-build-id added to the link.
+$CC -c -o $t/a.o $t/a.c -g
+$CC -B. -o $t/exe2 $t/a.o -Wl,--separate-debug-file -Wl,--no-build-id
+readelf -SW $t/exe2 | grep -Fq .gnu_debuglink
+# gdb must list main's source (the printf line) for the second executable as well.
+flock $t/exe2 true
+gdb $t/exe2 -ex 'list main' -ex 'quit' | grep -Fq printf
diff --git a/test/shared-abs-sym.sh b/test/shared-abs-sym.sh
new file mode 100755
index 00000000..cc6e0b0b
--- /dev/null
+++ b/test/shared-abs-sym.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+# a.so defines the absolute symbol foo = 3.
+cat <<EOF | $CC -B. -fPIC -shared -o $t/a.so -xassembler -
+.globl foo
+foo = 3;
+EOF
+# b.so defines the same symbol with a different absolute value, foo = 5.
+cat <<EOF | $CC -B. -fPIC -shared -o $t/b.so -xassembler -
+.globl foo
+foo = 5;
+EOF
+# The program prints the address of foo, i.e. the symbol's value.
+cat <<EOF | $CC -fPIC -c -o $t/d.o -xc -
+#include <stdio.h>
+extern char foo;
+int main() { printf("foo=%p\n", &foo); }
+EOF
+# PIE: link against c.so (a copy of a.so), then swap in b.so and expect the new value.
+cp $t/a.so $t/c.so
+$CC -B. -o $t/exe1 $t/d.o $t/c.so -pie || skip
+$QEMU $t/exe1 | grep -q 'foo=0x3' || skip
+cp $t/b.so $t/c.so
+$QEMU $t/exe1 | grep -q 'foo=0x5'
+# Non-PIE: same check; the final run uses exe2 so the -no-pie output is what gets verified.
+cp $t/a.so $t/c.so
+$CC -B. -o $t/exe2 $t/d.o $t/c.so -no-pie
+$QEMU $t/exe2 | grep -q 'foo=0x3'
+cp $t/b.so $t/c.so
+$QEMU $t/exe2 | grep -q 'foo=0x5'
diff --git a/test/elf/shared.sh b/test/shared.sh
similarity index 100%
rename from test/elf/shared.sh
rename to test/shared.sh
diff --git a/test/elf/shuffle-sections-seed.sh b/test/shuffle-sections-seed.sh
similarity index 100%
rename from test/elf/shuffle-sections-seed.sh
rename to test/shuffle-sections-seed.sh
diff --git a/test/elf/shuffle-sections.sh b/test/shuffle-sections.sh
similarity index 100%
rename from test/elf/shuffle-sections.sh
rename to test/shuffle-sections.sh
diff --git a/test/elf/soname.sh b/test/soname.sh
similarity index 100%
rename from test/elf/soname.sh
rename to test/soname.sh
diff --git a/test/elf/spare-program-headers.sh b/test/spare-program-headers.sh
similarity index 100%
rename from test/elf/spare-program-headers.sh
rename to test/spare-program-headers.sh
diff --git a/test/elf/start-lib.sh b/test/start-lib.sh
similarity index 100%
rename from test/elf/start-lib.sh
rename to test/start-lib.sh
diff --git a/test/elf/start-stop-symbol.sh b/test/start-stop-symbol.sh
similarity index 100%
rename from test/elf/start-stop-symbol.sh
rename to test/start-stop-symbol.sh
diff --git a/test/elf/start-stop.sh b/test/start-stop.sh
similarity index 100%
rename from test/elf/start-stop.sh
rename to test/start-stop.sh
diff --git a/test/elf/static-archive.sh b/test/static-archive.sh
similarity index 100%
rename from test/elf/static-archive.sh
rename to test/static-archive.sh
diff --git a/test/elf/static-pie.sh b/test/static-pie.sh
similarity index 100%
rename from test/elf/static-pie.sh
rename to test/static-pie.sh
diff --git a/test/elf/stdout.sh b/test/stdout.sh
similarity index 100%
rename from test/elf/stdout.sh
rename to test/stdout.sh
diff --git a/test/elf/strip-debug.sh b/test/strip-debug.sh
similarity index 100%
rename from test/elf/strip-debug.sh
rename to test/strip-debug.sh
diff --git a/test/elf/strip.sh b/test/strip.sh
similarity index 79%
rename from test/elf/strip.sh
rename to test/strip.sh
index f39cdc39..de6b7d1e 100755
--- a/test/elf/strip.sh
+++ b/test/strip.sh
@@ -15,7 +15,7 @@ grep -Fq _start $t/log
 grep -Fq foo $t/log
 grep -Fq bar $t/log
 
-if [ $MACHINE '!=' riscv32 ] && [ $MACHINE '!=' riscv64 ]; then
+if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then
   grep -Fq .L.baz $t/log
 fi
 
@@ -25,6 +25,6 @@ readelf --symbols $t/exe > $t/log
 ! grep -Fq foo $t/log || false
 ! grep -Fq bar $t/log || false
 
-if [ $MACHINE '!=' riscv32 ] && [ $MACHINE '!=' riscv64 ]; then
+if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then
   ! grep -Fq .L.baz $t/log || false
 fi
diff --git a/test/elf/stt-common.sh b/test/stt-common.sh
similarity index 100%
rename from test/elf/stt-common.sh
rename to test/stt-common.sh
diff --git a/test/elf/symbol-rank.sh b/test/symbol-rank.sh
similarity index 100%
rename from test/elf/symbol-rank.sh
rename to test/symbol-rank.sh
diff --git a/test/elf/symbol-version-lto.sh b/test/symbol-version-lto.sh
similarity index 92%
rename from test/elf/symbol-version-lto.sh
rename to test/symbol-version-lto.sh
index f8b3f2eb..de02e456 100755
--- a/test/elf/symbol-version-lto.sh
+++ b/test/symbol-version-lto.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
+test_cflags -flto || skip
+
 cat <<EOF | $CC -fPIC -c -o $t/a.o -xc - -flto
 void foo_1() {}
 __asm__(".symver foo_1, foo@@VER1");
diff --git a/test/elf/symbol-version.sh b/test/symbol-version.sh
similarity index 100%
rename from test/elf/symbol-version.sh
rename to test/symbol-version.sh
diff --git a/test/elf/symbol-version2.sh b/test/symbol-version2.sh
similarity index 100%
rename from test/elf/symbol-version2.sh
rename to test/symbol-version2.sh
diff --git a/test/elf/symbol-version3.sh b/test/symbol-version3.sh
similarity index 100%
rename from test/elf/symbol-version3.sh
rename to test/symbol-version3.sh
diff --git a/test/elf/symbol-version4.sh b/test/symbol-version4.sh
similarity index 100%
rename from test/elf/symbol-version4.sh
rename to test/symbol-version4.sh
diff --git a/test/elf/symtab-dso.sh b/test/symtab-dso.sh
similarity index 100%
rename from test/elf/symtab-dso.sh
rename to test/symtab-dso.sh
diff --git a/test/elf/symtab-section-symbols.sh b/test/symtab-section-symbols.sh
similarity index 100%
rename from test/elf/symtab-section-symbols.sh
rename to test/symtab-section-symbols.sh
diff --git a/test/elf/symtab.sh b/test/symtab.sh
similarity index 100%
rename from test/elf/symtab.sh
rename to test/symtab.sh
diff --git a/test/elf/synthetic-symbols.sh b/test/synthetic-symbols.sh
similarity index 100%
rename from test/elf/synthetic-symbols.sh
rename to test/synthetic-symbols.sh
diff --git a/test/elf/sysroot-linker-script.sh b/test/sysroot-linker-script.sh
similarity index 100%
rename from test/elf/sysroot-linker-script.sh
rename to test/sysroot-linker-script.sh
diff --git a/test/elf/sysroot.sh b/test/sysroot.sh
similarity index 100%
rename from test/elf/sysroot.sh
rename to test/sysroot.sh
diff --git a/test/elf/sysroot2.sh b/test/sysroot2.sh
similarity index 100%
rename from test/elf/sysroot2.sh
rename to test/sysroot2.sh
diff --git a/test/elf/tail-call.sh b/test/tail-call.sh
similarity index 100%
rename from test/elf/tail-call.sh
rename to test/tail-call.sh
diff --git a/test/elf/tbss-only.sh b/test/tbss-only.sh
similarity index 100%
rename from test/elf/tbss-only.sh
rename to test/tbss-only.sh
diff --git a/test/elf/thin-archive.sh b/test/thin-archive.sh
similarity index 93%
rename from test/elf/thin-archive.sh
rename to test/thin-archive.sh
index 26a00de8..a1e7d60c 100755
--- a/test/elf/thin-archive.sh
+++ b/test/thin-archive.sh
@@ -31,7 +31,7 @@ rm -f $t/d.a
 $CC -B. -Wl,--trace -o $t/exe $t/d.o $t/d.a > $t/log
 
 grep -Eq 'thin-archive/d.a\(.*long-long-long-filename.o\)' $t/log
-grep -Eq 'thin-archive/d.a\(.*/b.o\)' $t/log
+grep -Eq 'thin-archive/d.a\((.*/)?b.o\)' $t/log
 grep -Fq thin-archive/d.o $t/log
 
 $QEMU $t/exe | grep -q 15
diff --git a/test/elf/thread-count.sh b/test/thread-count.sh
similarity index 100%
rename from test/elf/thread-count.sh
rename to test/thread-count.sh
diff --git a/test/elf/tls-alignment-multi.sh b/test/tls-alignment-multi.sh
similarity index 100%
rename from test/elf/tls-alignment-multi.sh
rename to test/tls-alignment-multi.sh
diff --git a/test/elf/tls-common.sh b/test/tls-common.sh
similarity index 100%
rename from test/elf/tls-common.sh
rename to test/tls-common.sh
diff --git a/test/elf/tls-df-static-tls.sh b/test/tls-df-static-tls.sh
similarity index 100%
rename from test/elf/tls-df-static-tls.sh
rename to test/tls-df-static-tls.sh
diff --git a/test/elf/tls-dso.sh b/test/tls-dso.sh
similarity index 100%
rename from test/elf/tls-dso.sh
rename to test/tls-dso.sh
diff --git a/test/elf/tls-gd-dlopen.sh b/test/tls-gd-dlopen.sh
similarity index 100%
rename from test/elf/tls-gd-dlopen.sh
rename to test/tls-gd-dlopen.sh
diff --git a/test/elf/tls-gd-noplt.sh b/test/tls-gd-noplt.sh
similarity index 100%
rename from test/elf/tls-gd-noplt.sh
rename to test/tls-gd-noplt.sh
diff --git a/test/elf/tls-gd-to-ie.sh b/test/tls-gd-to-ie.sh
similarity index 100%
rename from test/elf/tls-gd-to-ie.sh
rename to test/tls-gd-to-ie.sh
diff --git a/test/elf/tls-gd.sh b/test/tls-gd.sh
similarity index 100%
rename from test/elf/tls-gd.sh
rename to test/tls-gd.sh
diff --git a/test/elf/tls-ie.sh b/test/tls-ie.sh
similarity index 100%
rename from test/elf/tls-ie.sh
rename to test/tls-ie.sh
diff --git a/test/elf/tls-irregular-start-addr.sh b/test/tls-irregular-start-addr.sh
similarity index 100%
rename from test/elf/tls-irregular-start-addr.sh
rename to test/tls-irregular-start-addr.sh
diff --git a/test/elf/tls-large-alignment.sh b/test/tls-large-alignment.sh
similarity index 100%
rename from test/elf/tls-large-alignment.sh
rename to test/tls-large-alignment.sh
diff --git a/test/elf/tls-large-static-image.sh b/test/tls-large-static-image.sh
similarity index 100%
rename from test/elf/tls-large-static-image.sh
rename to test/tls-large-static-image.sh
diff --git a/test/elf/tls-ld-noplt.sh b/test/tls-ld-noplt.sh
similarity index 100%
rename from test/elf/tls-ld-noplt.sh
rename to test/tls-ld-noplt.sh
diff --git a/test/elf/tls-ld.sh b/test/tls-ld.sh
similarity index 100%
rename from test/elf/tls-ld.sh
rename to test/tls-ld.sh
diff --git a/test/elf/tls-le-error.sh b/test/tls-le-error.sh
similarity index 100%
rename from test/elf/tls-le-error.sh
rename to test/tls-le-error.sh
diff --git a/test/elf/tls-le.sh b/test/tls-le.sh
similarity index 78%
rename from test/elf/tls-le.sh
rename to test/tls-le.sh
index 33e13411..502c73b0 100755
--- a/test/elf/tls-le.sh
+++ b/test/tls-le.sh
@@ -22,8 +22,8 @@ cat <<EOF | $GCC -fPIC -c -o $t/b.o -xc -
 __attribute__((tls_model("local-exec"))) _Thread_local int foo = 3;
 EOF
 
-$CC -B. -o $t/exe $t/a.o $t/b.o
-$QEMU $t/exe | grep -q '3 5 3 5'
+$CC -B. -o $t/exe1 $t/a.o $t/b.o
+$QEMU $t/exe1 | grep -q '3 5 3 5'
 
-$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-no-relax
-$QEMU $t/exe | grep -q '3 5 3 5'
+$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax
+$QEMU $t/exe2 | grep -q '3 5 3 5'
diff --git a/test/elf/tls-nopic.sh b/test/tls-nopic.sh
similarity index 100%
rename from test/elf/tls-nopic.sh
rename to test/tls-nopic.sh
diff --git a/test/elf/tls-pic.sh b/test/tls-pic.sh
similarity index 100%
rename from test/elf/tls-pic.sh
rename to test/tls-pic.sh
diff --git a/test/elf/tls-small-alignment.sh b/test/tls-small-alignment.sh
similarity index 100%
rename from test/elf/tls-small-alignment.sh
rename to test/tls-small-alignment.sh
diff --git a/test/elf/tlsdesc-dlopen.sh b/test/tlsdesc-dlopen.sh
similarity index 100%
rename from test/elf/tlsdesc-dlopen.sh
rename to test/tlsdesc-dlopen.sh
diff --git a/test/elf/tlsdesc-import.sh b/test/tlsdesc-import.sh
similarity index 100%
rename from test/elf/tlsdesc-import.sh
rename to test/tlsdesc-import.sh
diff --git a/test/elf/tlsdesc-initial-exec.sh b/test/tlsdesc-initial-exec.sh
similarity index 72%
rename from test/elf/tlsdesc-initial-exec.sh
rename to test/tlsdesc-initial-exec.sh
index 3911a227..aabcdae5 100755
--- a/test/elf/tlsdesc-initial-exec.sh
+++ b/test/tlsdesc-initial-exec.sh
@@ -29,11 +29,11 @@ EOF
 $CC -B. -o $t/exe1 $t/c.o $t/d.o $t/b.so
 $QEMU $t/exe1 | grep -q '^5 5 5$'
 
-readelf -Wr $t/exe1 > $t/log1
-! grep -Eq 'TLS.?DESC' $t/log1 || false
+$OBJDUMP --dynamic-reloc $t/exe1 > $t/log1
+! grep -Eq 'TLS_?DESC' $t/log1 || false
 
-$CC -B. -o $t/exe1 $t/c.o $t/d.o $t/b.so -Wl,--no-relax
-$QEMU $t/exe1 | grep -q '^5 5 5$'
+$CC -B. -o $t/exe2 $t/c.o $t/d.o $t/b.so -Wl,--no-relax
+$QEMU $t/exe2 | grep -q '^5 5 5$'
 
-readelf -Wr $t/exe1 > $t/log2
-grep -Eq 'TLS.?DESC' $t/log2
+$OBJDUMP --dynamic-reloc $t/exe2 > $t/log2
+grep -Eq 'TLS_?DESC' $t/log2
diff --git a/test/elf/tlsdesc-local-dynamic.sh b/test/tlsdesc-local-dynamic.sh
similarity index 100%
rename from test/elf/tlsdesc-local-dynamic.sh
rename to test/tlsdesc-local-dynamic.sh
diff --git a/test/elf/tlsdesc-static.sh b/test/tlsdesc-static.sh
similarity index 100%
rename from test/elf/tlsdesc-static.sh
rename to test/tlsdesc-static.sh
diff --git a/test/elf/tlsdesc.sh b/test/tlsdesc.sh
similarity index 100%
rename from test/elf/tlsdesc.sh
rename to test/tlsdesc.sh
diff --git a/test/elf/trace-symbol-symver.sh b/test/trace-symbol-symver.sh
similarity index 100%
rename from test/elf/trace-symbol-symver.sh
rename to test/trace-symbol-symver.sh
diff --git a/test/elf/trace-symbol.sh b/test/trace-symbol.sh
similarity index 100%
rename from test/elf/trace-symbol.sh
rename to test/trace-symbol.sh
diff --git a/test/elf/trace.sh b/test/trace.sh
similarity index 100%
rename from test/elf/trace.sh
rename to test/trace.sh
diff --git a/test/elf/undefined-glob-gc-sections.sh b/test/undefined-glob-gc-sections.sh
similarity index 100%
rename from test/elf/undefined-glob-gc-sections.sh
rename to test/undefined-glob-gc-sections.sh
diff --git a/test/elf/undefined-glob.sh b/test/undefined-glob.sh
similarity index 100%
rename from test/elf/undefined-glob.sh
rename to test/undefined-glob.sh
diff --git a/test/elf/undefined.sh b/test/undefined.sh
similarity index 100%
rename from test/elf/undefined.sh
rename to test/undefined.sh
diff --git a/test/elf/undefined2.sh b/test/undefined2.sh
similarity index 100%
rename from test/elf/undefined2.sh
rename to test/undefined2.sh
diff --git a/test/elf/unkown-section-type.sh b/test/unkown-section-type.sh
similarity index 100%
rename from test/elf/unkown-section-type.sh
rename to test/unkown-section-type.sh
diff --git a/test/elf/unresolved-symbols.sh b/test/unresolved-symbols.sh
similarity index 100%
rename from test/elf/unresolved-symbols.sh
rename to test/unresolved-symbols.sh
diff --git a/test/elf/unresolved-symbols2.sh b/test/unresolved-symbols2.sh
similarity index 100%
rename from test/elf/unresolved-symbols2.sh
rename to test/unresolved-symbols2.sh
diff --git a/test/elf/verbose.sh b/test/verbose.sh
similarity index 100%
rename from test/elf/verbose.sh
rename to test/verbose.sh
diff --git a/test/elf/version-script-search-paths.sh b/test/version-script-search-paths.sh
similarity index 100%
rename from test/elf/version-script-search-paths.sh
rename to test/version-script-search-paths.sh
diff --git a/test/elf/version-script.sh b/test/version-script.sh
similarity index 100%
rename from test/elf/version-script.sh
rename to test/version-script.sh
diff --git a/test/elf/version-script10.sh b/test/version-script10.sh
similarity index 100%
rename from test/elf/version-script10.sh
rename to test/version-script10.sh
diff --git a/test/elf/version-script11.sh b/test/version-script11.sh
similarity index 100%
rename from test/elf/version-script11.sh
rename to test/version-script11.sh
diff --git a/test/elf/version-script12.sh b/test/version-script12.sh
similarity index 100%
rename from test/elf/version-script12.sh
rename to test/version-script12.sh
diff --git a/test/elf/version-script13.sh b/test/version-script13.sh
similarity index 100%
rename from test/elf/version-script13.sh
rename to test/version-script13.sh
diff --git a/test/elf/version-script14.sh b/test/version-script14.sh
similarity index 100%
rename from test/elf/version-script14.sh
rename to test/version-script14.sh
diff --git a/test/elf/version-script15.sh b/test/version-script15.sh
similarity index 100%
rename from test/elf/version-script15.sh
rename to test/version-script15.sh
diff --git a/test/elf/version-script16.sh b/test/version-script16.sh
similarity index 100%
rename from test/elf/version-script16.sh
rename to test/version-script16.sh
diff --git a/test/elf/version-script17.sh b/test/version-script17.sh
similarity index 100%
rename from test/elf/version-script17.sh
rename to test/version-script17.sh
diff --git a/test/elf/version-script18.sh b/test/version-script18.sh
similarity index 100%
rename from test/elf/version-script18.sh
rename to test/version-script18.sh
diff --git a/test/elf/version-script19.sh b/test/version-script19.sh
similarity index 100%
rename from test/elf/version-script19.sh
rename to test/version-script19.sh
diff --git a/test/elf/version-script2.sh b/test/version-script2.sh
similarity index 100%
rename from test/elf/version-script2.sh
rename to test/version-script2.sh
diff --git a/test/elf/version-script20.sh b/test/version-script20.sh
similarity index 100%
rename from test/elf/version-script20.sh
rename to test/version-script20.sh
diff --git a/test/elf/version-script21.sh b/test/version-script21.sh
similarity index 100%
rename from test/elf/version-script21.sh
rename to test/version-script21.sh
diff --git a/test/elf/version-script22.sh b/test/version-script22.sh
similarity index 100%
rename from test/elf/version-script22.sh
rename to test/version-script22.sh
diff --git a/test/elf/version-script23.sh b/test/version-script23.sh
similarity index 100%
rename from test/elf/version-script23.sh
rename to test/version-script23.sh
diff --git a/test/elf/version-script3.sh b/test/version-script3.sh
similarity index 100%
rename from test/elf/version-script3.sh
rename to test/version-script3.sh
diff --git a/test/elf/version-script4.sh b/test/version-script4.sh
similarity index 100%
rename from test/elf/version-script4.sh
rename to test/version-script4.sh
diff --git a/test/elf/version-script5.sh b/test/version-script5.sh
similarity index 100%
rename from test/elf/version-script5.sh
rename to test/version-script5.sh
diff --git a/test/elf/version-script6.sh b/test/version-script6.sh
similarity index 100%
rename from test/elf/version-script6.sh
rename to test/version-script6.sh
diff --git a/test/elf/version-script7.sh b/test/version-script7.sh
similarity index 100%
rename from test/elf/version-script7.sh
rename to test/version-script7.sh
diff --git a/test/elf/version-script8.sh b/test/version-script8.sh
similarity index 100%
rename from test/elf/version-script8.sh
rename to test/version-script8.sh
diff --git a/test/elf/version-script9.sh b/test/version-script9.sh
similarity index 100%
rename from test/elf/version-script9.sh
rename to test/version-script9.sh
diff --git a/test/elf/version.sh b/test/version.sh
similarity index 100%
rename from test/elf/version.sh
rename to test/version.sh
diff --git a/test/elf/versioned-undef.sh b/test/versioned-undef.sh
similarity index 100%
rename from test/elf/versioned-undef.sh
rename to test/versioned-undef.sh
diff --git a/test/elf/visibility.sh b/test/visibility.sh
similarity index 100%
rename from test/elf/visibility.sh
rename to test/visibility.sh
diff --git a/test/elf/warn-common.sh b/test/warn-common.sh
similarity index 100%
rename from test/elf/warn-common.sh
rename to test/warn-common.sh
diff --git a/test/elf/warn-once.sh b/test/warn-once.sh
similarity index 83%
rename from test/elf/warn-once.sh
rename to test/warn-once.sh
index 44ab16a3..852fe0e2 100755
--- a/test/elf/warn-once.sh
+++ b/test/warn-once.sh
@@ -14,4 +14,4 @@ EOF
 
 $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,--warn-unresolved-symbols,--warn-once >& $t/log
 
-[ "$(grep 'undefined symbol:.* foo$' $t/log | wc -l)" = 1 ]
+[ $(grep 'undefined symbol:.* foo$' $t/log | wc -l) = 1 ]
diff --git a/test/elf/warn-symbol-type.sh b/test/warn-symbol-type.sh
similarity index 100%
rename from test/elf/warn-symbol-type.sh
rename to test/warn-symbol-type.sh
diff --git a/test/elf/warn-unresolved-symbols.sh b/test/warn-unresolved-symbols.sh
similarity index 100%
rename from test/elf/warn-unresolved-symbols.sh
rename to test/warn-unresolved-symbols.sh
diff --git a/test/elf/weak-export-dso.sh b/test/weak-export-dso.sh
similarity index 100%
rename from test/elf/weak-export-dso.sh
rename to test/weak-export-dso.sh
diff --git a/test/elf/weak-export-dso2.sh b/test/weak-export-dso2.sh
similarity index 100%
rename from test/elf/weak-export-dso2.sh
rename to test/weak-export-dso2.sh
diff --git a/test/elf/weak-export-exe.sh b/test/weak-export-exe.sh
similarity index 100%
rename from test/elf/weak-export-exe.sh
rename to test/weak-export-exe.sh
diff --git a/test/elf/weak-undef-dso.sh b/test/weak-undef-dso.sh
similarity index 100%
rename from test/elf/weak-undef-dso.sh
rename to test/weak-undef-dso.sh
diff --git a/test/elf/weak-undef.sh b/test/weak-undef.sh
similarity index 100%
rename from test/elf/weak-undef.sh
rename to test/weak-undef.sh
diff --git a/test/elf/weak-undef2.sh b/test/weak-undef2.sh
similarity index 100%
rename from test/elf/weak-undef2.sh
rename to test/weak-undef2.sh
diff --git a/test/elf/weak-undef4.sh b/test/weak-undef4.sh
similarity index 100%
rename from test/elf/weak-undef4.sh
rename to test/weak-undef4.sh
diff --git a/test/elf/weak-undef5.sh b/test/weak-undef5.sh
similarity index 100%
rename from test/elf/weak-undef5.sh
rename to test/weak-undef5.sh
diff --git a/test/elf/whole-archive.sh b/test/whole-archive.sh
similarity index 61%
rename from test/elf/whole-archive.sh
rename to test/whole-archive.sh
index de5da115..721acf37 100755
--- a/test/elf/whole-archive.sh
+++ b/test/whole-archive.sh
@@ -14,19 +14,19 @@ ar cr $t/d.a $t/b.o $t/c.o
 
 $CC -B. -nostdlib -o $t/exe $t/a.o $t/d.a
 
-readelf --symbols $t/exe > $t/readelf
-! grep -q fn1 $t/readelf || false
-! grep -q fn2 $t/readelf || false
+readelf --symbols $t/exe > $t/log
+! grep -q fn1 $t/log || false
+! grep -q fn2 $t/log || false
 
 $CC -B. -nostdlib -o $t/exe $t/a.o -Wl,--whole-archive $t/d.a
 
-readelf --symbols $t/exe > $t/readelf
-grep -q fn1 $t/readelf
-grep -q fn2 $t/readelf
+readelf --symbols $t/exe > $t/log
+grep -q fn1 $t/log
+grep -q fn2 $t/log
 
 $CC -B. -nostdlib -o $t/exe $t/a.o -Wl,--whole-archive \
   -Wl,--no-whole-archive $t/d.a
 
-readelf --symbols $t/exe > $t/readelf
-! grep -q fn1 $t/readelf || false
-! grep -q fn2 $t/readelf || false
+readelf --symbols $t/exe > $t/log
+! grep -q fn1 $t/log || false
+! grep -q fn2 $t/log || false
diff --git a/test/elf/wrap-lto.sh b/test/wrap-lto.sh
similarity index 96%
rename from test/elf/wrap-lto.sh
rename to test/wrap-lto.sh
index 1e26af8c..0e2fb52b 100755
--- a/test/elf/wrap-lto.sh
+++ b/test/wrap-lto.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 . $(dirname $0)/common.inc
 
+test_cflags -flto || skip
+
 cat <<EOF | $CC -fPIC -shared -o $t/a.so -xc -
 #include <stdio.h>
 
diff --git a/test/elf/wrap.sh b/test/wrap.sh
similarity index 100%
rename from test/elf/wrap.sh
rename to test/wrap.sh
diff --git a/test/elf/z-cet-report.sh b/test/z-cet-report.sh
similarity index 100%
rename from test/elf/z-cet-report.sh
rename to test/z-cet-report.sh
diff --git a/test/elf/z-defs.sh b/test/z-defs.sh
similarity index 100%
rename from test/elf/z-defs.sh
rename to test/z-defs.sh
diff --git a/test/elf/z-dynamic-undefined-weak.sh b/test/z-dynamic-undefined-weak.sh
similarity index 100%
rename from test/elf/z-dynamic-undefined-weak.sh
rename to test/z-dynamic-undefined-weak.sh
diff --git a/test/elf/z-max-page-size.sh b/test/z-max-page-size.sh
similarity index 100%
rename from test/elf/z-max-page-size.sh
rename to test/z-max-page-size.sh
diff --git a/test/elf/z-nodefaultlib.sh b/test/z-nodefaultlib.sh
similarity index 100%
rename from test/elf/z-nodefaultlib.sh
rename to test/z-nodefaultlib.sh
diff --git a/test/elf/z-nodump.sh b/test/z-nodump.sh
similarity index 100%
rename from test/elf/z-nodump.sh
rename to test/z-nodump.sh
diff --git a/test/elf/z-now.sh b/test/z-now.sh
similarity index 100%
rename from test/elf/z-now.sh
rename to test/z-now.sh
diff --git a/test/elf/z-origin.sh b/test/z-origin.sh
similarity index 100%
rename from test/elf/z-origin.sh
rename to test/z-origin.sh
diff --git a/test/z-pack-relative-relocs.sh b/test/z-pack-relative-relocs.sh
new file mode 100755
index 00000000..357bb859
--- /dev/null
+++ b/test/z-pack-relative-relocs.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+# Test for -z pack-relative-relocs: the linked PIE must run and carry RELR dynamic entries.
+cat <<EOF | $CC -o $t/a.o -fPIC -c -xc -
+#include <stdio.h>
+int main() {
+  printf("Hello world\n");
+}
+EOF
+# Probe without -B. first; skip if the link fails, .relr.dyn is absent, or the result does not run.
+$CC -o $t/exe1 $t/a.o -pie -Wl,-z,pack-relative-relocs 2> /dev/null || skip
+readelf -WS $t/exe1 | grep -Fq .relr.dyn || skip
+$QEMU $t/exe1 2> /dev/null | grep -q Hello || skip
+# Same link with -B. added; the output must still run and print Hello.
+$CC -B. -o $t/exe2 $t/a.o -pie -Wl,-z,pack-relative-relocs
+$QEMU $t/exe2 | grep -q Hello
+# The dynamic section must list the RELR, RELRSZ and RELRENT entries.
+readelf --dynamic $t/exe2 > $t/log2
+grep -wq RELR $t/log2
+grep -wq RELRSZ $t/log2
+grep -wq RELRENT $t/log2
diff --git a/test/elf/z-rodynamic.sh b/test/z-rodynamic.sh
similarity index 100%
rename from test/elf/z-rodynamic.sh
rename to test/z-rodynamic.sh
diff --git a/test/elf/z-sectionheader.sh b/test/z-sectionheader.sh
similarity index 100%
rename from test/elf/z-sectionheader.sh
rename to test/z-sectionheader.sh
diff --git a/test/elf/z-separate-code.sh b/test/z-separate-code.sh
similarity index 100%
rename from test/elf/z-separate-code.sh
rename to test/z-separate-code.sh
diff --git a/test/elf/z-stack-size.sh b/test/z-stack-size.sh
similarity index 100%
rename from test/elf/z-stack-size.sh
rename to test/z-stack-size.sh
diff --git a/test/elf/z-start-stop-visibility.sh b/test/z-start-stop-visibility.sh
similarity index 100%
rename from test/elf/z-start-stop-visibility.sh
rename to test/z-start-stop-visibility.sh
diff --git a/test/elf/z-unknown.sh b/test/z-unknown.sh
similarity index 100%
rename from test/elf/z-unknown.sh
rename to test/z-unknown.sh
diff --git a/third-party/mimalloc/.gitattributes b/third-party/mimalloc/.gitattributes
index f083b107..0332e031 100644
--- a/third-party/mimalloc/.gitattributes
+++ b/third-party/mimalloc/.gitattributes
@@ -10,4 +10,3 @@
 *.dll binary
 *.lib binary
 *.exe binary
-bin export-ignore
diff --git a/third-party/mimalloc/.gitignore b/third-party/mimalloc/.gitignore
index f8b7f5eb..df1d58eb 100644
--- a/third-party/mimalloc/.gitignore
+++ b/third-party/mimalloc/.gitignore
@@ -7,3 +7,5 @@ ide/vs20??/VTune*
 out/
 docs/
 *.zip
+*.tar
+*.gz
diff --git a/third-party/mimalloc/CMakeLists.txt b/third-party/mimalloc/CMakeLists.txt
index 2cc2fc46..bcfe91d8 100644
--- a/third-party/mimalloc/CMakeLists.txt
+++ b/third-party/mimalloc/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(libmimalloc C CXX)
 
 set(CMAKE_C_STANDARD 11)
@@ -35,6 +35,7 @@ option(MI_NO_THP            "Disable transparent huge pages support on Linux/And
 option(MI_CHECK_FULL        "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
 option(MI_USE_LIBATOMIC     "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF)
 
+include(CheckLinkerFlag)    # requires cmake 3.18
 include(CheckIncludeFiles)
 include(GNUInstallDirs)
 include("cmake/mimalloc-config-version.cmake")
@@ -338,29 +339,45 @@ if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914)
   list(APPEND mi_cflags /Zc:__cplusplus)
 endif()
 
+if(MINGW)
+  add_definitions(-D_WIN32_WINNT=0x600)
+endif()
+
 # extra needed libraries
+
+# find_link_library(libname outlibname): set <outlibname> to <libname> if `-l<libname>` links, else to the
+# `find_library` path, else to "". The -l test goes first as libs like `libatomic` may be off the system path (#898).
+function(find_link_library libname outlibname)
+  check_linker_flag(C "-l${libname}" mi_has_lib${libname})
+  if (mi_has_lib${libname})
+    message(VERBOSE "link library: -l${libname}")
+    set(${outlibname} ${libname} PARENT_SCOPE)
+  else()
+    find_library(MI_LIBPATH_${libname} "${libname}")  # per-library cache entry so one result is not reused for another library
+    if (MI_LIBPATH_${libname})
+      message(VERBOSE "link library ${libname} at ${MI_LIBPATH_${libname}}")
+      set(${outlibname} ${MI_LIBPATH_${libname}} PARENT_SCOPE)
+    else()
+      message(VERBOSE "link library not found: ${libname}")
+      set(${outlibname} "" PARENT_SCOPE)
+    endif()
+  endif()
+endfunction()
+
 if(WIN32)
-  list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt)
-  set(pc_libraries "-lpsapi -lshell32 -luser32 -ladvapi32 -lbcrypt")
+  list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt)  
 else()
-  set(pc_libraries "")
-  find_library(MI_LIBPTHREAD pthread)
-  if (MI_LIBPTHREAD)
-    list(APPEND mi_libraries ${MI_LIBPTHREAD})
-    set(pc_libraries "${pc_libraries} -pthread")
-  endif()
-  find_library(MI_LIBRT rt)
-  if(MI_LIBRT)
-    list(APPEND mi_libraries ${MI_LIBRT})
-    set(pc_libraries "${pc_libraries} -lrt")
+  find_link_library("pthread" MI_LIB_PTHREAD)
+  if(MI_LIB_PTHREAD) 
+    list(APPEND mi_libraries "${MI_LIB_PTHREAD}")
   endif()
-  find_library(MI_LIBATOMIC atomic)
-  if (NOT MI_LIBATOMIC AND MI_USE_LIBATOMIC)
-    set(MI_LIBATOMIC atomic)
+  find_link_library("rt" MI_LIB_RT)
+  if(MI_LIB_RT) 
+    list(APPEND mi_libraries "${MI_LIB_RT}")
   endif()
-  if (MI_LIBATOMIC)
-    list(APPEND mi_libraries ${MI_LIBATOMIC})
-    set(pc_libraries "${pc_libraries} -latomic")
+  find_link_library("atomic" MI_LIB_ATOMIC)
+  if(MI_LIB_ATOMIC) 
+    list(APPEND mi_libraries "${MI_LIB_ATOMIC}")  
   endif()
 endif()
 
@@ -369,7 +386,8 @@ endif()
 # -----------------------------------------------------------------------------
 
 # dynamic/shared library and symlinks always go to /usr/local/lib equivalent
-set(mi_install_libdir   "${CMAKE_INSTALL_LIBDIR}")
+set(mi_install_libdir       "${CMAKE_INSTALL_LIBDIR}")
+set(mi_install_bindir       "${CMAKE_INSTALL_BINDIR}")
 
 # static libraries and object files, includes, and cmake config files
 # are either installed at top level, or use versioned directories for side-by-side installation (default)
@@ -453,10 +471,10 @@ if(MI_BUILD_SHARED)
     add_custom_command(TARGET mimalloc POST_BUILD
       COMMAND "${CMAKE_COMMAND}" -E copy "${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" $<TARGET_FILE_DIR:mimalloc>
       COMMENT "Copy mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll to output directory")
-    install(FILES "$<TARGET_FILE_DIR:mimalloc>/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${mi_install_libdir})
+    install(FILES "$<TARGET_FILE_DIR:mimalloc>/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${mi_install_bindir})
   endif()
 
-  install(TARGETS mimalloc EXPORT mimalloc DESTINATION ${mi_install_libdir} LIBRARY)
+  install(TARGETS mimalloc EXPORT mimalloc ARCHIVE DESTINATION ${mi_install_libdir} RUNTIME DESTINATION ${mi_install_bindir} LIBRARY DESTINATION ${mi_install_libdir})
   install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
 endif()
 
@@ -522,6 +540,15 @@ if (MI_BUILD_OBJECT)
 endif()
 
 # pkg-config file support
+set(pc_libraries "")
+foreach(item IN LISTS mi_libraries)
+  if(item MATCHES " *[-].*")
+    set(pc_libraries "${pc_libraries} ${item}")
+  else()
+    set(pc_libraries "${pc_libraries} -l${item}")
+  endif()
+endforeach()
+
 include("cmake/JoinPaths.cmake")
 join_paths(includedir_for_pc_file "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}")
 join_paths(libdir_for_pc_file "\${prefix}" "${CMAKE_INSTALL_LIBDIR}")
@@ -530,6 +557,8 @@ configure_file(mimalloc.pc.in mimalloc.pc @ONLY)
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mimalloc.pc"
         DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/")
 
+
+
 # -----------------------------------------------------------------------------
 # API surface testing
 # -----------------------------------------------------------------------------
diff --git a/third-party/mimalloc/bin/mimalloc-redirect.lib b/third-party/mimalloc/bin/mimalloc-redirect.lib
new file mode 100644
index 00000000..de128bb9
Binary files /dev/null and b/third-party/mimalloc/bin/mimalloc-redirect.lib differ
diff --git a/third-party/mimalloc/bin/mimalloc-redirect32.lib b/third-party/mimalloc/bin/mimalloc-redirect32.lib
new file mode 100644
index 00000000..87f19b8e
Binary files /dev/null and b/third-party/mimalloc/bin/mimalloc-redirect32.lib differ
diff --git a/third-party/mimalloc/bin/readme.md b/third-party/mimalloc/bin/readme.md
new file mode 100644
index 00000000..9b121bda
--- /dev/null
+++ b/third-party/mimalloc/bin/readme.md
@@ -0,0 +1,71 @@
+# Windows Override
+
+<span id="override_on_windows">Dynamically overriding on mimalloc on Windows</span> 
+is robust and has the particular advantage to be able to redirect all malloc/free calls that go through
+the (dynamic) C runtime allocator, including those from other DLL's or libraries.
+As it intercepts all allocation calls on a low level, it can be used reliably 
+on large programs that include other 3rd party components.
+There are four requirements to make the overriding work robustly:
+
+1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
+
+2. Link your program explicitly with the `mimalloc-override.dll` library.
+   To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some
+    call to the mimalloc API in the `main` function, like `mi_version()`
+    (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
+    for an example on how to use this. 
+
+3. The `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put
+   in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL).
+   The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
+   mimalloc functions (which reside in `mimalloc-override.dll`).
+
+4. Ensure the `mimalloc-override.dll` comes as early as possible in the import
+   list of the final executable (so it can intercept all potential allocations).
+
+For best performance on Windows with C++, it
+is also recommended to override the `new`/`delete` operations (by including
+[`mimalloc-new-delete.h`](../include/mimalloc-new-delete.h)
+in a single(!) source file in your project).
+
+The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
+overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
+
+## Minject
+
+We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always
+ensure that the DLL comes first in the import table of the final executable.
+In many cases though we can patch existing executables without any recompilation
+if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll`
+into the import table (and put `mimalloc-redirect.dll` in the same folder).
+Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388).
+
+The `minject` program can also do this from the command line, use `minject --help` for options:
+
+```
+> minject --help
+
+minject:
+  Injects the mimalloc dll into the import table of a 64-bit executable,
+  and/or ensures that it comes first in het import table.
+
+usage:
+  > minject [options] <exe>
+
+options:
+  -h   --help        show this help
+  -v   --verbose     be verbose
+  -l   --list        only list imported modules
+  -i   --inplace     update the exe in-place (make sure there is a backup!)
+  -f   --force       always overwrite without prompting
+       --postfix=<p> use <p> as a postfix to the mimalloc dll (default is 'override')
+                     e.g. use --postfix=override-debug to link with mimalloc-override-debug.dll
+
+notes:
+  Without '--inplace' an injected <exe> is generated with the same name ending in '-mi'.
+  Ensure 'mimalloc-redirect.dll' is in the same folder as the mimalloc dll.
+
+examples:
+  > minject --list myprogram.exe
+  > minject --force --inplace myprogram.exe
+```  
diff --git a/third-party/mimalloc/cmake/mimalloc-config-version.cmake b/third-party/mimalloc/cmake/mimalloc-config-version.cmake
index 9b19b56b..81fd3c9d 100644
--- a/third-party/mimalloc/cmake/mimalloc-config-version.cmake
+++ b/third-party/mimalloc/cmake/mimalloc-config-version.cmake
@@ -1,6 +1,6 @@
 set(mi_version_major 2)
 set(mi_version_minor 1)
-set(mi_version_patch 6)
+set(mi_version_patch 7)
 set(mi_version ${mi_version_major}.${mi_version_minor})
 
 set(PACKAGE_VERSION ${mi_version})
diff --git a/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile b/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile
new file mode 100644
index 00000000..56f071db
--- /dev/null
+++ b/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile
@@ -0,0 +1,28 @@
+# install from an image
+# download first an appropiate tar.gz image into the current directory 
+# from: <https://github.com/alpinelinux/docker-alpine/tree/edge/armv7>
+FROM scratch
+
+# Substitute the image name that was downloaded
+ADD alpine-minirootfs-20240329-armv7.tar.gz /    
+
+# Install tools
+RUN apk add build-base make cmake
+RUN apk add git
+RUN apk add vim
+
+RUN mkdir -p  /home/dev
+WORKDIR /home/dev
+
+# Get mimalloc
+RUN git clone https://github.com/microsoft/mimalloc -b dev-slice
+RUN mkdir -p mimalloc/out/release
+RUN mkdir -p mimalloc/out/debug
+
+# Build mimalloc debug
+WORKDIR /home/dev/mimalloc/out/debug
+RUN cmake ../.. -DMI_DEBUG_FULL=ON
+RUN make -j
+RUN make test
+
+CMD ["/bin/sh"]
diff --git a/third-party/mimalloc/docker/alpine/Dockerfile b/third-party/mimalloc/docker/alpine/Dockerfile
new file mode 100644
index 00000000..b222b791
--- /dev/null
+++ b/third-party/mimalloc/docker/alpine/Dockerfile
@@ -0,0 +1,23 @@
+# alpine image  
+FROM alpine
+
+# Install tools
+RUN apk add build-base make cmake
+RUN apk add git
+RUN apk add vim
+
+RUN mkdir -p  /home/dev
+WORKDIR /home/dev
+
+# Get mimalloc
+RUN git clone https://github.com/microsoft/mimalloc -b dev-slice
+RUN mkdir -p mimalloc/out/release
+RUN mkdir -p mimalloc/out/debug
+
+# Build mimalloc debug
+WORKDIR /home/dev/mimalloc/out/debug
+RUN cmake ../.. -DMI_DEBUG_FULL=ON
+RUN make -j
+RUN make test
+
+CMD ["/bin/sh"]
\ No newline at end of file
diff --git a/third-party/mimalloc/docker/manylinux-x64/Dockerfile b/third-party/mimalloc/docker/manylinux-x64/Dockerfile
new file mode 100644
index 00000000..22d37e5a
--- /dev/null
+++ b/third-party/mimalloc/docker/manylinux-x64/Dockerfile
@@ -0,0 +1,23 @@
+FROM quay.io/pypa/manylinux2014_x86_64
+
+# Install tools
+RUN yum install -y openssl-devel
+RUN yum install -y gcc gcc-c++ kernel-devel make
+RUN yum install -y git cmake
+RUN yum install -y vim
+
+RUN mkdir -p  /home/dev
+WORKDIR /home/dev
+
+# Get mimalloc
+RUN git clone https://github.com/microsoft/mimalloc -b dev-slice
+RUN mkdir -p mimalloc/out/release
+RUN mkdir -p mimalloc/out/debug
+
+# Build mimalloc debug
+WORKDIR /home/dev/mimalloc/out/debug
+RUN cmake ../.. -DMI_DEBUG_FULL=ON
+RUN make -j
+RUN make test
+
+CMD ["/bin/sh"]
\ No newline at end of file
diff --git a/third-party/mimalloc/docker/readme.md b/third-party/mimalloc/docker/readme.md
new file mode 100644
index 00000000..b3d90094
--- /dev/null
+++ b/third-party/mimalloc/docker/readme.md
@@ -0,0 +1,10 @@
+Various example docker files used for testing.
+
+Usage:
+
+```
+> cd <host>
+> docker build -t <host>-mimalloc .
+> docker run -it <host>-mimalloc
+>> make test
+```
diff --git a/third-party/mimalloc/include/mimalloc-override.h b/third-party/mimalloc/include/mimalloc-override.h
index c63b0b91..48a8a622 100644
--- a/third-party/mimalloc/include/mimalloc-override.h
+++ b/third-party/mimalloc/include/mimalloc-override.h
@@ -24,7 +24,7 @@ not accidentally mix pointers from different allocators).
 #define free(p)                 mi_free(p)
 
 #define strdup(s)               mi_strdup(s)
-#define strndup(s,n)              mi_strndup(s,n)
+#define strndup(s,n)            mi_strndup(s,n)
 #define realpath(f,n)           mi_realpath(f,n)
 
 // Microsoft extensions
@@ -43,6 +43,7 @@ not accidentally mix pointers from different allocators).
 #define reallocf(p,n)           mi_reallocf(p,n)
 #define malloc_size(p)          mi_usable_size(p)
 #define malloc_usable_size(p)   mi_usable_size(p)
+#define malloc_good_size(sz)    mi_malloc_good_size(sz)
 #define cfree(p)                mi_free(p)
 
 #define valloc(n)               mi_valloc(n)
diff --git a/third-party/mimalloc/include/mimalloc.h b/third-party/mimalloc/include/mimalloc.h
index 8446d99d..c41bcc80 100644
--- a/third-party/mimalloc/include/mimalloc.h
+++ b/third-party/mimalloc/include/mimalloc.h
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_H
 #define MIMALLOC_H
 
-#define MI_MALLOC_VERSION 216   // major + 2 digits minor
+#define MI_MALLOC_VERSION 217   // major + 2 digits minor
 
 // ------------------------------------------------------
 // Compiler specific attributes
@@ -328,7 +328,7 @@ typedef enum mi_option_e {
   mi_option_allow_large_os_pages,       // allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process.
   mi_option_reserve_huge_os_pages,      // reserve N huge OS pages (1GiB pages) at startup
   mi_option_reserve_huge_os_pages_at,   // reserve huge OS pages at a specific NUMA node
-  mi_option_reserve_os_memory,          // reserve specified amount of OS memory in an arena at startup
+  mi_option_reserve_os_memory,          // reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`)
   mi_option_deprecated_segment_cache,
   mi_option_deprecated_page_reset,
   mi_option_abandoned_page_purge,       // immediately purge delayed purges on thread termination
@@ -342,11 +342,12 @@ typedef enum mi_option_e {
   mi_option_max_warnings,               // issue at most N warning messages
   mi_option_max_segment_reclaim,        // max. percentage of the abandoned segments can be reclaimed per try (=10%)
   mi_option_destroy_on_exit,            // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe
-  mi_option_arena_reserve,              // initial memory size in KiB for arena reservation (= 1 GiB on 64-bit)
+  mi_option_arena_reserve,              // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
   mi_option_arena_purge_mult,           // multiplier for `purge_delay` for the purging delay for arenas (=10)
   mi_option_purge_extend_delay,
   mi_option_abandoned_reclaim_on_free,  // allow to reclaim an abandoned segment on a free (=1)
   mi_option_disallow_arena_alloc,       // 1 = do not use arena's for allocation (except if using specific arena id's)
+  mi_option_retry_on_oom,               // retry on out-of-memory for N milliseconds (=400), set to 0 to disable retries. (only on windows)
   _mi_option_last,
   // legacy option names
   mi_option_large_os_pages = mi_option_allow_large_os_pages,
diff --git a/third-party/mimalloc/include/mimalloc/atomic.h b/third-party/mimalloc/include/mimalloc/atomic.h
index 807c4da8..d5333dd9 100644
--- a/third-party/mimalloc/include/mimalloc/atomic.h
+++ b/third-party/mimalloc/include/mimalloc/atomic.h
@@ -132,7 +132,7 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
 
 #elif defined(_MSC_VER)
 
-// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
+// Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics.
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #endif
@@ -201,7 +201,7 @@ static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_
 #else
   uintptr_t x = *p;
   if (mo > mi_memory_order_relaxed) {
-    while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
+    while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
   }
   return x;
 #endif
diff --git a/third-party/mimalloc/include/mimalloc/internal.h b/third-party/mimalloc/include/mimalloc/internal.h
index 44f4cafe..6c6e5ed0 100644
--- a/third-party/mimalloc/include/mimalloc/internal.h
+++ b/third-party/mimalloc/include/mimalloc/internal.h
@@ -14,8 +14,8 @@ terms of the MIT license. A copy of the license can be found in the file
 // functions and macros.
 // --------------------------------------------------------------------------
 
-#include "mimalloc/types.h"
-#include "mimalloc/track.h"
+#include "types.h"
+#include "track.h"
 
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
@@ -88,6 +88,7 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
 mi_heap_t*    _mi_heap_main_get(void);     // statically allocated main backing heap
 void       _mi_thread_done(mi_heap_t* heap);
 void       _mi_thread_data_collect(void);
+void       _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap);
 
 // os.c
 void       _mi_os_init(void);                                            // called from process init
@@ -186,11 +187,13 @@ size_t     _mi_bin_size(uint8_t bin);           // for stats
 uint8_t    _mi_bin(size_t size);                // for stats
 
 // "heap.c"
+void       _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag);
 void       _mi_heap_destroy_pages(mi_heap_t* heap);
 void       _mi_heap_collect_abandon(mi_heap_t* heap);
 void       _mi_heap_set_default_direct(mi_heap_t* heap);
 bool       _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid);
 void       _mi_heap_unsafe_destroy_all(void);
+mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag);
 
 // "stats.c"
 void       _mi_stats_done(mi_stats_t* stats);
@@ -379,10 +382,10 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
 }
 #else /* __builtin_umul_overflow is unavailable */
 static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
-  #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
+  #define MI_MUL_COULD_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // 2^(bits/2), i.e. ~sqrt(SIZE_MAX): the product can only overflow if an operand is at least this large
   *total = count * size;
   // note: gcc/clang optimize this to directly check the overflow flag
-  return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count);
+  return ((size >= MI_MUL_COULD_OVERFLOW || count >= MI_MUL_COULD_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count);
 }
 #endif
 
@@ -546,6 +549,7 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
 static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
   mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
   mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
+  if (heap != NULL) { page->heap_tag = heap->tag; }  // mirror the owning heap's tag in the page; a NULL heap leaves the previous tag in place
 }
 
 // Thread free flag helpers
diff --git a/third-party/mimalloc/include/mimalloc/prim.h b/third-party/mimalloc/include/mimalloc/prim.h
index 4d813b7f..3f4574dd 100644
--- a/third-party/mimalloc/include/mimalloc/prim.h
+++ b/third-party/mimalloc/include/mimalloc/prim.h
@@ -26,7 +26,7 @@ typedef struct mi_os_mem_config_s {
   size_t  large_page_size;      // 0 if not supported, usually 2MiB (4MiB on Windows)
   size_t  alloc_granularity;    // smallest allocation size (usually 4KiB, on Windows 64KiB)
   bool    has_overcommit;       // can we reserve more memory than can be actually committed?
-  bool    must_free_whole;      // must allocated blocks be freed as a whole (false for mmap, true for VirtualAlloc)
+  bool    has_partial_free;     // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc)
   bool    has_virtual_reserve;  // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
 } mi_os_mem_config_t;
 
@@ -198,7 +198,7 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
     tcb[slot] = value;
   #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
     MI_UNUSED(ofs);
-    pthread_setspecific(slot, value);    
+    pthread_setspecific(slot, value);
   #endif
 }
 
@@ -208,13 +208,18 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
 // but unfortunately, it seems we cannot test for this reliably at this time (see issue #883)
 // Nevertheless, it seems needed on older graviton platforms (see issue #851).
 // For now, we only enable this for specific platforms.
-#if defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__) /* special case aarch64 for older gcc versions (issue #851) */ \
-    && !defined(__APPLE__)  /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly (<https://github.com/microsoft/mimalloc/issues/343#issuecomment-763272369>)*/ \
+#if !defined(__APPLE__)  /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly (<https://github.com/microsoft/mimalloc/issues/343#issuecomment-763272369>)*/ \
+    && !defined(MI_LIBC_MUSL) \
     && (!defined(__clang_major__) || __clang_major__ >= 14)  /* older clang versions emit bad code; fall back to using the TLS slot (<https://lore.kernel.org/linux-arm-kernel/202110280952.352F66D8@keescook/T/>) */
-#define MI_USE_BUILTIN_THREAD_POINTER  1
+  #if    (defined(__GNUC__) && (__GNUC__ >= 7)  && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \
+      || (defined(__GNUC__) && (__GNUC__ >= 11) && defined(__x86_64__)) \
+      || (defined(__clang_major__) && (__clang_major__ >= 14) && (defined(__aarch64__) || defined(__x86_64__)))
+    #define MI_USE_BUILTIN_THREAD_POINTER  1
+  #endif
 #endif
 
 
+
 // defined in `init.c`; do not use these directly
 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
 extern bool _mi_process_is_initialized;             // has mi_process_init been called?
@@ -222,7 +227,13 @@ extern bool _mi_process_is_initialized;             // has mi_process_init been
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
 
 // Get a unique id for the current thread.
-#if defined(_WIN32)
+#if defined(MI_PRIM_THREAD_ID)
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  return MI_PRIM_THREAD_ID();  // used for example by CPython for a free threaded build (see python/cpython#115488)
+}
+
+#elif defined(_WIN32)
 
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
@@ -233,11 +244,11 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
   return (uintptr_t)NtCurrentTeb();
 }
 
-#elif MI_USE_BUILTIN_THREAD_POINTER 
+#elif MI_USE_BUILTIN_THREAD_POINTER
 
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
   // Works on most Unix based platforms with recent compilers
-  return (uintptr_t)__builtin_thread_pointer();  
+  return (uintptr_t)__builtin_thread_pointer();
 }
 
 #elif defined(MI_HAS_TLS_SLOT)
diff --git a/third-party/mimalloc/include/mimalloc/types.h b/third-party/mimalloc/include/mimalloc/types.h
index cc807ee9..2fdde904 100644
--- a/third-party/mimalloc/include/mimalloc/types.h
+++ b/third-party/mimalloc/include/mimalloc/types.h
@@ -24,7 +24,7 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #include <stddef.h>   // ptrdiff_t
 #include <stdint.h>   // uintptr_t, uint16_t, etc
-#include "mimalloc/atomic.h"  // _Atomic
+#include "atomic.h"   // _Atomic
 
 #ifdef _MSC_VER
 #pragma warning(disable:4214) // bitfield is not int
@@ -319,6 +319,7 @@ typedef struct mi_page_s {
   mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
   uint16_t              used;              // number of blocks in use (including blocks in `thread_free`)
   uint8_t               block_size_shift;  // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
+  uint8_t               heap_tag;          // tag of the owning heap, used for separated heaps by object type
                                            // padding
   size_t                block_size;        // size available in each block (always `>0`)
   uint8_t*              page_start;        // start of the page area containing the blocks
@@ -538,6 +539,7 @@ struct mi_heap_s {
   size_t                page_retired_max;                    // largest retired index into the `pages` array.
   mi_heap_t*            next;                                // list of heaps per thread
   bool                  no_reclaim;                          // `true` if this heap should not reclaim abandoned pages
+  uint8_t               tag;                                 // custom tag, can be used for separating heaps based on the object types
   mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
   mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
 };
diff --git a/third-party/mimalloc/readme.md b/third-party/mimalloc/readme.md
index 91974587..a0296b43 100644
--- a/third-party/mimalloc/readme.md
+++ b/third-party/mimalloc/readme.md
@@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac
 Initially developed by Daan Leijen for the runtime systems of the
 [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
 
-Latest release tag: `v2.1.6` (2024-05-13).  
-Latest v1 tag: `v1.8.6` (2024-05-13).
+Latest release tag: `v2.1.7` (2024-05-21).  
+Latest v1 tag: `v1.8.7` (2024-05-21).
 
 mimalloc is a drop-in replacement for `malloc` and can be used in other programs
 without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@@ -82,6 +82,8 @@ memory usage
   and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance
   (see [below](#performance)); please report if you observe any significant performance regression.
 
+* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches
+  from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches.
 * 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation.
 * 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds.
   Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size
diff --git a/third-party/mimalloc/src/arena.c b/third-party/mimalloc/src/arena.c
index 62bea78b..648ee844 100644
--- a/third-party/mimalloc/src/arena.c
+++ b/third-party/mimalloc/src/arena.c
@@ -51,12 +51,13 @@ typedef struct mi_arena_s {
   bool     exclusive;                     // only allow allocations if specifically for this arena
   bool     is_large;                      // memory area consists of large- or huge OS pages (always committed)
   _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
-  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.
+  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.  
   mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
   mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
   mi_bitmap_field_t* blocks_purge;        // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
   mi_bitmap_field_t* blocks_abandoned;    // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here)
   mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
+  // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields.
 } mi_arena_t;
 
 
@@ -144,18 +145,19 @@ static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bit
 
 #define MI_ARENA_STATIC_MAX  (MI_INTPTR_SIZE*MI_KiB)  // 8 KiB on 64-bit
 
-static uint8_t mi_arena_static[MI_ARENA_STATIC_MAX];
-static _Atomic(size_t) mi_arena_static_top;
+static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX];  // must be cache aligned, see issue #895
+static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top;
 
 static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) {
   *memid = _mi_memid_none();
   if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL;
-  if ((mi_atomic_load_relaxed(&mi_arena_static_top) + size) > MI_ARENA_STATIC_MAX) return NULL;
+  const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top);
+  if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL;
 
   // try to claim space
-  if (alignment == 0) { alignment = 1; }
+  if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; }
   const size_t oversize = size + alignment - 1;
-  if (oversize > MI_ARENA_STATIC_MAX) return NULL;
+  if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL;
   const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize);
   size_t top = oldtop + oversize;
   if (top > MI_ARENA_STATIC_MAX) {
@@ -169,7 +171,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m
   memid->initially_zero = true;
   const size_t start = _mi_align_up(oldtop, alignment);
   uint8_t* const p = &mi_arena_static[start];
-  _mi_memzero(p, size);
+  _mi_memzero_aligned(p, size);
   return p;
 }
 
diff --git a/third-party/mimalloc/src/heap.c b/third-party/mimalloc/src/heap.c
index 6c56edd6..e498fdb2 100644
--- a/third-party/mimalloc/src/heap.c
+++ b/third-party/mimalloc/src/heap.c
@@ -128,6 +128,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
   const bool force = (collect >= MI_FORCE);
   _mi_deferred_free(heap, force);
 
+  // python/cpython#112532: we may be called from a thread that is not the owner of the heap
+  const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
+
   // note: never reclaim on collect but leave it to threads that need storage to reclaim
   const bool force_main =
     #ifdef NDEBUG
@@ -135,7 +138,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
     #else
       collect >= MI_FORCE
     #endif
-      && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim;
+      && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim;
 
   if (force_main) {
     // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
@@ -164,7 +167,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
   _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
   
   // if forced, collect thread data cache on program-exit (or shared library unload)
-  if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
+  if (force && is_main_thread && mi_heap_is_backing(heap)) {
     _mi_thread_data_collect();  // collect thread data cache
   }
   
@@ -208,22 +211,33 @@ mi_heap_t* mi_heap_get_backing(void) {
   return bheap;
 }
 
-mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
-  mi_heap_t* bheap = mi_heap_get_backing();
-  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
-  if (heap == NULL) return NULL;
+void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) {  // set up `heap` from the empty-heap template, bind it to `tld`, and link it into the thread's heap list
   _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
-  heap->tld = bheap->tld;
-  heap->thread_id = _mi_thread_id();
-  heap->arena_id = arena_id;
-  _mi_random_split(&bheap->random, &heap->random);
-  heap->cookie = _mi_heap_random_next(heap) | 1;
+  heap->tld = tld;
+  heap->thread_id  = _mi_thread_id();  // id of the calling thread
+  heap->arena_id   = arena_id;
+  heap->no_reclaim = noreclaim;
+  heap->tag        = tag;
+  if (heap == tld->heap_backing) {  // backing heap: initialize its random state directly (requires `tld->heap_backing` to be set already)
+    _mi_random_init(&heap->random);
+  }
+  else {  // any other heap: split its random state off the backing heap's
+    _mi_random_split(&tld->heap_backing->random, &heap->random);
+  }
+  heap->cookie  = _mi_heap_random_next(heap) | 1;  // `| 1` makes the cookie odd, hence never zero
   heap->keys[0] = _mi_heap_random_next(heap);
   heap->keys[1] = _mi_heap_random_next(heap);
-  heap->no_reclaim = true;  // don't reclaim abandoned pages or otherwise destroy is unsafe
   // push on the thread local heaps list
   heap->next = heap->tld->heaps;
   heap->tld->heaps = heap;
+}
+
+mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
+  mi_heap_t* const backing = mi_heap_get_backing();
+  mi_heap_t* const fresh = mi_heap_malloc_tp(backing, mi_heap_t);  // todo: OS allocate in secure mode?
+  if (fresh != NULL) {  // reclaiming abandoned pages stays disabled, otherwise destroying the heap would be unsafe
+    _mi_heap_init(fresh, backing->tld, arena_id, true /* no reclaim */, 0 /* default tag */);
+  }
+  return fresh;  // NULL when the heap structure could not be allocated
+}
 
@@ -281,6 +295,18 @@ static void mi_heap_free(mi_heap_t* heap) {
   mi_free(heap);
 }
 
+// Look up a heap carrying `tag` among the heaps that share `heap`'s thread-local data.
+// `heap` itself is returned when its tag matches; otherwise the `tld->heaps` list is
+// walked from its head and the first heap with a matching tag is returned.
+// Returns NULL when no such heap exists.
+mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag) {
+  if (heap->tag == tag) return heap;
+  mi_heap_t* candidate = heap->tld->heaps;
+  while (candidate != NULL && candidate->tag != tag) {
+    candidate = candidate->next;
+  }
+  return candidate;  // first match in the list, or NULL if the list was exhausted
+}
 
 /* -----------------------------------------------------------
   Heap destroy
diff --git a/third-party/mimalloc/src/init.c b/third-party/mimalloc/src/init.c
index 33161062..6f51ca89 100644
--- a/third-party/mimalloc/src/init.c
+++ b/third-party/mimalloc/src/init.c
@@ -25,6 +25,7 @@ const mi_page_t _mi_page_empty = {
   NULL,    // local_free
   0,       // used
   0,       // block size shift
+  0,       // heap tag
   0,       // block_size
   NULL,    // page_start
   #if (MI_PADDING || MI_ENCODE_FREELIST)
@@ -33,9 +34,7 @@ const mi_page_t _mi_page_empty = {
   MI_ATOMIC_VAR_INIT(0), // xthread_free
   MI_ATOMIC_VAR_INIT(0), // xheap
   NULL, NULL
-  #if MI_INTPTR_SIZE==8
   , { 0 }  // padding
-  #endif
 };
 
 #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@@ -124,7 +123,8 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   0,                // page count
   MI_BIN_FULL, 0,   // page retired min/max
   NULL,             // next
-  false,
+  false,            // can reclaim
+  0,                // tag
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY
 };
@@ -170,6 +170,7 @@ mi_heap_t _mi_heap_main = {
   MI_BIN_FULL, 0,   // page retired min/max
   NULL,             // next heap
   false,            // can reclaim
+  0,                // tag
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY
 };
@@ -288,7 +289,7 @@ void _mi_thread_data_collect(void) {
 }
 
 // Initialize the thread local default heap, called from `mi_thread_init`
-static bool _mi_heap_init(void) {
+static bool _mi_thread_heap_init(void) {
   if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true;
   if (_mi_is_main_thread()) {
     // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
@@ -304,26 +305,25 @@ static bool _mi_heap_init(void) {
 
     mi_tld_t*  tld = &td->tld;
     mi_heap_t* heap = &td->heap;
-    _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
-    _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
-    heap->thread_id = _mi_thread_id();
-    _mi_random_init(&heap->random);
-    heap->cookie  = _mi_heap_random_next(heap) | 1;
-    heap->keys[0] = _mi_heap_random_next(heap);
-    heap->keys[1] = _mi_heap_random_next(heap);
-    heap->tld = tld;
-    tld->heap_backing = heap;
-    tld->heaps = heap;
-    tld->segments.stats = &tld->stats;
-    tld->segments.os = &tld->os;
-    tld->os.stats = &tld->stats;
-    _mi_heap_set_default_direct(heap);
+    _mi_tld_init(tld, heap);  // must be before `_mi_heap_init`
+    _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */);
+    _mi_heap_set_default_direct(heap);   
   }
   return false;
 }
 
+// Reset `tld` to a copy of `tld_empty`, wire up its internal stats/os pointers, and record `bheap` as its backing heap.
+void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) {
+  _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
+  tld->os.stats = &tld->stats;
+  tld->segments.stats = &tld->stats;
+  tld->segments.os = &tld->os;
+  tld->heaps = NULL;  // starts empty; `_mi_heap_init` pushes each heap onto this list
+  tld->heap_backing = bheap;
+}
+
 // Free the thread local default heap (called from `mi_thread_done`)
-static bool _mi_heap_done(mi_heap_t* heap) {
+static bool _mi_thread_heap_done(mi_heap_t* heap) {
   if (!mi_heap_is_initialized(heap)) return true;
 
   // reset default heap
@@ -420,7 +420,7 @@ void mi_thread_init(void) mi_attr_noexcept
   // initialize the thread local default heap
   // (this will call `_mi_heap_set_default_direct` and thus set the
   //  fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
-  if (_mi_heap_init()) return;  // returns true if already initialized
+  if (_mi_thread_heap_init()) return;  // returns true if already initialized
 
   _mi_stat_increase(&_mi_stats_main.threads, 1);
   mi_atomic_increment_relaxed(&thread_count);
@@ -452,7 +452,7 @@ void _mi_thread_done(mi_heap_t* heap)
   if (heap->thread_id != _mi_thread_id()) return;
 
   // abandon the thread local heap
-  if (_mi_heap_done(heap)) return;  // returns true if already ran
+  if (_mi_thread_heap_done(heap)) return;  // returns true if already ran
 }
 
 void _mi_heap_set_default_direct(mi_heap_t* heap)  {
diff --git a/third-party/mimalloc/src/options.c b/third-party/mimalloc/src/options.c
index fba90761..a62727dd 100644
--- a/third-party/mimalloc/src/options.c
+++ b/third-party/mimalloc/src/options.c
@@ -65,7 +65,7 @@ static mi_option_desc_t options[_mi_option_last] =
   { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) },    // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
   { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },      // per 1GiB huge pages
   {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) },   // reserve huge pages at node N
-  { 0, UNINIT, MI_OPTION(reserve_os_memory)     },      // reserve OS memory in advance
+  { 0, UNINIT, MI_OPTION(reserve_os_memory)     },      // reserve N KiB OS memory in advance (use `option_get_size`)
   { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },   // cache N segments per thread
   { 0, UNINIT, MI_OPTION(deprecated_page_reset) },      // reset page memory on free
   { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) },       // reset free page memory when a thread terminates
@@ -79,19 +79,20 @@ static mi_option_desc_t options[_mi_option_last] =
   { 0,   UNINIT, MI_OPTION(use_numa_nodes) },           // 0 = use available numa nodes, otherwise use at most N nodes.
   { 0,   UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) },           // 1 = do not use OS memory for allocation (but only reserved arenas)
   { 100, UNINIT, MI_OPTION(os_tag) },                   // only apple specific for now but might serve more or less related purpose
-  { 16,  UNINIT, MI_OPTION(max_errors) },               // maximum errors that are output
-  { 16,  UNINIT, MI_OPTION(max_warnings) },             // maximum warnings that are output
-  { 10,  UNINIT, MI_OPTION(max_segment_reclaim)},       // max. percentage of the abandoned segments per try.
+  { 32,  UNINIT, MI_OPTION(max_errors) },               // maximum errors that are output
+  { 32,  UNINIT, MI_OPTION(max_warnings) },             // maximum warnings that are output
+  { 10,  UNINIT, MI_OPTION(max_segment_reclaim)},       // max. percentage of the abandoned segments to be reclaimed per try.
   { 0,   UNINIT, MI_OPTION(destroy_on_exit)},           // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
   #if (MI_INTPTR_SIZE>4)
-  { 1024L * 1024L, UNINIT, MI_OPTION(arena_reserve) },  // reserve memory N KiB at a time
+  { 1024L*1024L, UNINIT, MI_OPTION(arena_reserve) },    // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
   #else
-  {  128L * 1024L, UNINIT, MI_OPTION(arena_reserve) },
+  {  128L*1024L, UNINIT, MI_OPTION(arena_reserve) },    // =128MiB on 32-bit
   #endif
   { 10,  UNINIT, MI_OPTION(arena_purge_mult) },        // purge delay multiplier for arena's
   { 1,   UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) },
   { 1,   UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
   { 0,   UNINIT, MI_OPTION(disallow_arena_alloc) },     // 1 = do not use arena's for allocation (except if using specific arena id's)
+  { 400, UNINIT, MI_OPTION(retry_on_oom) },             // windows only: retry on out-of-memory for N milliseconds (=400), set to 0 to disable retries.
 };
 
 static void mi_option_init(mi_option_desc_t* desc);
@@ -135,8 +136,12 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma
 
 mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) {
   mi_assert_internal(mi_option_has_size_in_kib(option));
-  long x = mi_option_get(option);
-  return (x < 0 ? 0 : (size_t)x * MI_KiB);
+  const long x = mi_option_get(option);       // raw option value as stored
+  size_t size = (x < 0 ? 0 : (size_t)x);      // negative values are treated as 0
+  if (mi_option_has_size_in_kib(option)) {    // value is kept in KiB: convert it to bytes
+    if (mi_mul_overflow(size, MI_KiB, &size)) { size = MI_MAX_ALLOC_SIZE; }  // saturate instead of silently wrapping around
+  }
+  return size;
 }
 
 void mi_option_set(mi_option_t option, long value) {
@@ -479,14 +484,20 @@ static void mi_option_init(mi_option_desc_t* desc) {
     else {
       char* end = buf;
       long value = strtol(buf, &end, 10);
-      if (desc->option == mi_option_reserve_os_memory || desc->option == mi_option_arena_reserve) {
-        // this option is interpreted in KiB to prevent overflow of `long`
+      if (mi_option_has_size_in_kib(desc->option)) {
+        // this option is interpreted in KiB to prevent overflow of `long` for large allocations
+        // (long is 32-bit on 64-bit windows, which allows for 2TiB max.)
+        size_t size = (value < 0 ? 0 : (size_t)value);
+        bool overflow = false;
         if (*end == 'K') { end++; }
-        else if (*end == 'M') { value *= MI_KiB; end++; }
-        else if (*end == 'G') { value *= MI_MiB; end++; }
-        else { value = (value + MI_KiB - 1) / MI_KiB; }
-        if (end[0] == 'I' && end[1] == 'B') { end += 2; }
-        else if (*end == 'B') { end++; }
+        else if (*end == 'M') { overflow = mi_mul_overflow(size,MI_KiB,&size); end++; }
+        else if (*end == 'G') { overflow = mi_mul_overflow(size,MI_MiB,&size); end++; }
+        else if (*end == 'T') { overflow = mi_mul_overflow(size,MI_GiB,&size); end++; }
+        else { size = (size + MI_KiB - 1) / MI_KiB; }     // no suffix: the value is in bytes, round up to whole KiB
+        if (end[0] == 'I' && end[1] == 'B') { end += 2; } // KiB, MiB, GiB, TiB
+        else if (*end == 'B') { end++; }                  // Kb, Mb, Gb, Tb
+        if (overflow || size > (MI_MAX_ALLOC_SIZE / MI_KiB)) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); }  // `size` is in KiB here, so clamp against the byte limit expressed in KiB
+        value = (size > LONG_MAX ? LONG_MAX : (long)size);
       }
       if (*end == 0) {
         desc->value = value;
diff --git a/third-party/mimalloc/src/os.c b/third-party/mimalloc/src/os.c
index dda6844c..ce104273 100644
--- a/third-party/mimalloc/src/os.c
+++ b/third-party/mimalloc/src/os.c
@@ -11,9 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file
 
 
 /* -----------------------------------------------------------
-  Initialization.
-  On windows initializes support for aligned allocation and
-  large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
+  Initialization. 
 ----------------------------------------------------------- */
 
 static mi_os_mem_config_t mi_os_mem_config = {
@@ -21,7 +19,7 @@ static mi_os_mem_config_t mi_os_mem_config = {
   0,      // large page size (usually 2MiB)
   4096,   // allocation granularity
   true,   // has overcommit?  (if true we use MAP_NORESERVE on mmap systems)
-  false,  // must free whole? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
+  false,  // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
   true    // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
 };
 
@@ -239,7 +237,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
     if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
     const size_t over_size = size + alignment;
 
-    if (mi_os_mem_config.must_free_whole) {  // win32 virtualAlloc cannot free parts of an allocate block
+    if (!mi_os_mem_config.has_partial_free) {  // win32 virtualAlloc cannot free parts of an allocated block
       // over-allocate uncommitted (virtual) memory
       p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats);
       if (p == NULL) return NULL;
@@ -260,7 +258,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
       p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats);
       if (p == NULL) return NULL;
 
-      // and selectively unmap parts around the over-allocated area. (noop on sbrk)
+      // and selectively unmap parts around the over-allocated area. 
       void* aligned_p = mi_align_up_ptr(p, alignment);
       size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
       size_t mid_size = _mi_align_up(size, _mi_os_page_size());
@@ -268,7 +266,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
       mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size);
       if (pre_size > 0)  { mi_os_prim_free(p, pre_size, commit, stats); }
       if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); }
-      // we can return the aligned pointer on `mmap` (and sbrk) systems
+      // we can return the aligned pointer on `mmap` systems
       p = aligned_p;
       *base = aligned_p; // since we freed the pre part, `*base == p`.
     }
diff --git a/third-party/mimalloc/src/prim/emscripten/prim.c b/third-party/mimalloc/src/prim/emscripten/prim.c
index 1f60a1bb..f3797c9e 100644
--- a/third-party/mimalloc/src/prim/emscripten/prim.c
+++ b/third-party/mimalloc/src/prim/emscripten/prim.c
@@ -51,7 +51,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config) {
   config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
   config->alloc_granularity = 16;
   config->has_overcommit = false;
-  config->must_free_whole = true;
+  config->has_partial_free = false;
   config->has_virtual_reserve = false;
 }
 
diff --git a/third-party/mimalloc/src/prim/osx/alloc-override-zone.c b/third-party/mimalloc/src/prim/osx/alloc-override-zone.c
index 9a317750..1515b886 100644
--- a/third-party/mimalloc/src/prim/osx/alloc-override-zone.c
+++ b/third-party/mimalloc/src/prim/osx/alloc-override-zone.c
@@ -422,6 +422,7 @@ __attribute__((constructor(0)))
 #else
 __attribute__((constructor))      // seems not supported by g++-11 on the M1
 #endif
+__attribute__((used))
 static void _mi_macos_override_malloc(void) {
   malloc_zone_t* purgeable_zone = NULL;
 
diff --git a/third-party/mimalloc/src/prim/unix/prim.c b/third-party/mimalloc/src/prim/unix/prim.c
index 7e4e8f7b..90a4aac2 100644
--- a/third-party/mimalloc/src/prim/unix/prim.c
+++ b/third-party/mimalloc/src/prim/unix/prim.c
@@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <unistd.h>    // sysconf
 #include <fcntl.h>     // open, close, read, access
 #include <stdlib.h>
-  
+
 #if defined(__linux__)
   #include <features.h>
   #if defined(MI_NO_THP)
@@ -58,7 +58,7 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <sys/sysctl.h>
 #endif
 
-#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__) && !defined(__sun)
+#if defined(__linux__) || defined(__FreeBSD__)
   #define MI_HAS_SYSCALL_H
   #include <sys/syscall.h>
 #endif
@@ -66,39 +66,38 @@ terms of the MIT license. A copy of the license can be found in the file
 
 //------------------------------------------------------------------------------------
 // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
-// and do allocation themselves; using syscalls prevents recursion when mimalloc is 
+// and do allocation themselves; using syscalls prevents recursion when mimalloc is
 // still initializing (issue #713)
+// Declare inline to avoid unused function warnings.
 //------------------------------------------------------------------------------------
 
-
 #if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access)
 
-static int mi_prim_open(const char* fpath, int open_flags) {
+static inline int mi_prim_open(const char* fpath, int open_flags) {
   return syscall(SYS_open,fpath,open_flags,0);
 }
-static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
+static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
   return syscall(SYS_read,fd,buf,bufsize);
 }
-static int mi_prim_close(int fd) {
+static inline int mi_prim_close(int fd) {
   return syscall(SYS_close,fd);
 }
-static int mi_prim_access(const char *fpath, int mode) {
+static inline int mi_prim_access(const char *fpath, int mode) {
   return syscall(SYS_access,fpath,mode);
 }
 
-#elif !defined(__sun) && \
-      (!defined(__APPLE__) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7))  // avoid unused warnings on macOS and Solaris
+#else
 
-static int mi_prim_open(const char* fpath, int open_flags) {
+static inline int mi_prim_open(const char* fpath, int open_flags) {
   return open(fpath,open_flags);
 }
-static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
+static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
   return read(fd,buf,bufsize);
 }
-static int mi_prim_close(int fd) {
+static inline int mi_prim_close(int fd) {
   return close(fd);
 }
-static int mi_prim_access(const char *fpath, int mode) {
+static inline int mi_prim_access(const char *fpath, int mode) {
   return access(fpath,mode);
 }
 
@@ -131,12 +130,12 @@ static bool unix_detect_overcommit(void) {
     os_overcommit = (val != 0);
   }
 #else
-  // default: overcommit is true  
+  // default: overcommit is true
 #endif
   return os_overcommit;
 }
 
-void _mi_prim_mem_init( mi_os_mem_config_t* config ) 
+void _mi_prim_mem_init( mi_os_mem_config_t* config )
 {
   long psize = sysconf(_SC_PAGESIZE);
   if (psize > 0) {
@@ -145,7 +144,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
   }
   config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
   config->has_overcommit = unix_detect_overcommit();
-  config->must_free_whole = false;    // mmap can free in parts
+  config->has_partial_free = true;    // mmap can free in parts
   config->has_virtual_reserve = true; // todo: check if this true for NetBSD?  (for anonymous mmap with PROT_NONE)
 
   // disable transparent huge pages for this process?
@@ -198,12 +197,12 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
     size_t n = mi_bsr(try_alignment);
     if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
       p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
-      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { 
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
         int err = errno;
-        _mi_warning_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr);
+        _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr);
       }
       if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap      
+      // fall back to regular mmap
     }
   }
   #elif defined(MAP_ALIGN)  // Solaris
@@ -219,16 +218,16 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
     void* hint = _mi_os_get_aligned_hint(try_alignment, size);
     if (hint != NULL) {
       p = mmap(hint, size, protect_flags, flags, fd, 0);
-      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { 
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
         #if MI_TRACK_ENABLED  // asan sometimes does not instrument errno correctly?
         int err = 0;
         #else
         int err = errno;
         #endif
-        _mi_warning_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint);
+        _mi_trace_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint);
       }
       if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap      
+      // fall back to regular mmap
     }
   }
   #endif
@@ -357,9 +356,9 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
   mi_assert_internal(commit || !allow_large);
   mi_assert_internal(try_alignment > 0);
-  
+
   *is_zero = true;
-  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);  
+  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
   *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
   return (*addr != NULL ? 0 : errno);
 }
@@ -387,19 +386,19 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
   // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but
   // we sometimes call commit on a range with still partially committed
   // memory and `mprotect` does not zero the range.
-  *is_zero = false;  
+  *is_zero = false;
   int err = mprotect(start, size, (PROT_READ | PROT_WRITE));
-  if (err != 0) { 
-    err = errno; 
+  if (err != 0) {
+    err = errno;
     unix_mprotect_hint(err);
   }
   return err;
 }
 
 int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
-  int err = 0;  
+  int err = 0;
   // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
-  err = unix_madvise(start, size, MADV_DONTNEED);    
+  err = unix_madvise(start, size, MADV_DONTNEED);
   #if !MI_DEBUG && !MI_SECURE
     *needs_recommit = false;
   #else
@@ -411,15 +410,15 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
   *needs_recommit = true;
   const int fd = unix_mmap_fd();
   void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
-  if (p != start) { err = errno; }    
+  if (p != start) { err = errno; }
   */
   return err;
 }
 
 int _mi_prim_reset(void* start, size_t size) {
-  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it 
+  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
   // will not reduce the `rss` stats in tools like `top` even though the memory is available
-  // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by 
+  // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
   // default `MADV_DONTNEED` is used though.
   #if defined(MADV_FREE)
   static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
@@ -439,7 +438,7 @@ int _mi_prim_reset(void* start, size_t size) {
 
 int _mi_prim_protect(void* start, size_t size, bool protect) {
   int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
-  if (err != 0) { err = errno; }  
+  if (err != 0) { err = errno; }
   unix_mprotect_hint(err);
   return err;
 }
@@ -480,7 +479,7 @@ int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bo
     if (err != 0) {
       err = errno;
       _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err);
-    }    
+    }
   }
   return (*addr != NULL ? 0 : errno);
 }
@@ -595,9 +594,9 @@ mi_msecs_t _mi_prim_clock_now(void) {
 // low resolution timer
 mi_msecs_t _mi_prim_clock_now(void) {
   #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
-  return (mi_msecs_t)clock();  
+  return (mi_msecs_t)clock();
   #elif (CLOCKS_PER_SEC < 1000)
-  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);
   #else
   return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);
   #endif
@@ -637,7 +636,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo)
   pinfo->stime = timeval_secs(&rusage.ru_stime);
 #if !defined(__HAIKU__)
   pinfo->page_faults = rusage.ru_majflt;
-#endif  
+#endif
 #if defined(__HAIKU__)
   // Haiku does not have (yet?) a way to
   // get these stats per process
@@ -764,7 +763,7 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
   // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
   // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
-  return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);  
+  return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
 }
 
 #elif defined(__ANDROID__) || defined(__DragonFly__) || \
@@ -772,7 +771,6 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
       defined(__sun) || \
       (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7))
 
-#include <stdlib.h>
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
   arc4random_buf(buf, buf_len);
   return true;
@@ -863,7 +861,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
   }
 }
 
-#else 
+#else
 
 void _mi_prim_thread_init_auto_done(void) {
   // nothing
diff --git a/third-party/mimalloc/src/prim/wasi/prim.c b/third-party/mimalloc/src/prim/wasi/prim.c
index f74acd2a..e95f67f5 100644
--- a/third-party/mimalloc/src/prim/wasi/prim.c
+++ b/third-party/mimalloc/src/prim/wasi/prim.c
@@ -23,7 +23,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
   config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
   config->alloc_granularity = 16;
   config->has_overcommit = false;  
-  config->must_free_whole = true;
+  config->has_partial_free = false;
   config->has_virtual_reserve = false;
 }
 
diff --git a/third-party/mimalloc/src/prim/windows/prim.c b/third-party/mimalloc/src/prim/windows/prim.c
index 2dd7c602..5074ad4c 100644
--- a/third-party/mimalloc/src/prim/windows/prim.c
+++ b/third-party/mimalloc/src/prim/windows/prim.c
@@ -112,7 +112,7 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
 void _mi_prim_mem_init( mi_os_mem_config_t* config )
 {
   config->has_overcommit = false;
-  config->must_free_whole = true;
+  config->has_partial_free = false;
   config->has_virtual_reserve = true;
   // get the page size
   SYSTEM_INFO si;
@@ -178,7 +178,7 @@ int _mi_prim_free(void* addr, size_t size ) {
 // VirtualAlloc
 //---------------------------------------------
 
-static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_alignment, DWORD flags) {
   #if (MI_INTPTR_SIZE >= 8)
   // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
   if (addr == NULL) {
@@ -200,13 +200,53 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen
     param.Arg.Pointer = &reqs;
     void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
     if (p != NULL) return p;
-    _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
+    _mi_warning_message("unable to allocate aligned OS memory (0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
     // fall through on error
   }
   // last resort
   return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
 }
 
+// Returns true when `err` is one of the four Windows error codes that this
+// file treats as an out-of-memory condition; every other code yields false.
+// (The codes are split into two named groups below purely for readability.)
+static bool win_is_out_of_memory_error(DWORD err) {
+  const bool is_commitment_error = (err == ERROR_COMMITMENT_MINIMUM ||
+                                    err == ERROR_COMMITMENT_LIMIT);
+  const bool is_memory_error     = (err == ERROR_PAGEFILE_QUOTA ||
+                                    err == ERROR_NOT_ENOUGH_MEMORY);
+  // either group counts as being out of memory
+  return (is_commitment_error || is_memory_error);
+}
+
+// Allocates like `win_virtual_alloc_prim_once`, but when committing regular (non large-page)
+// memory fails with an out-of-memory error, waits a bit and retries (see issue #894).
+// The retry budget comes from `mi_option_retry_on_oom`, clamped to at most 2000 msecs.
+// Returns the allocated address, or NULL if the allocation (still) fails.
+static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+  long max_retry_msecs = mi_option_get_clamp(mi_option_retry_on_oom, 0, 2000);  // at most 2 seconds
+  if (max_retry_msecs == 1) { max_retry_msecs = 100; }  // if one sets the option to "true"
+  for (long tries = 1; tries <= 10; tries++) {          // try at most 10 times (=2200ms)
+    void* p = win_virtual_alloc_prim_once(addr, size, try_alignment, flags);
+    if (p != NULL) { return p; }  // success, return the address
+    const DWORD err = GetLastError();  // read the error code once so the check and the message agree
+    const bool can_retry = (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) &&
+                            (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 &&
+                            win_is_out_of_memory_error(err));
+    if (!can_retry) { break; }    // otherwise return with an error
+    // if committing regular memory and being out-of-memory,
+    // keep trying for a bit in case memory frees up after all.
+    // (`tries` is a signed `long`, so it is formatted with `%ld`)
+    _mi_warning_message("out-of-memory on OS allocation, try again... (attempt %ld, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, err, addr, try_alignment, flags);
+    long sleep_msecs = tries*40;  // increasing waits
+    if (sleep_msecs > max_retry_msecs) { sleep_msecs = max_retry_msecs; }  // never exceed the remaining budget
+    max_retry_msecs -= sleep_msecs;
+    // once the budget reaches 0, the next failed attempt ends the loop
+    Sleep(sleep_msecs);
+  }
+  return NULL;
+}
+
 static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
   mi_assert_internal(!(large_only && !allow_large));
   static _Atomic(size_t) large_page_try_ok; // = 0;
@@ -572,6 +612,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
 #if !defined(MI_SHARED_LIB)
 
 // use thread local storage keys to detect thread ending
+// note: another design could be to use special linker sections (see issue #869)
 #include <fibersapi.h>
 #if (_WIN32_WINNT < 0x600)  // before Windows Vista
 WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
diff --git a/third-party/mimalloc/src/segment.c b/third-party/mimalloc/src/segment.c
index 9ac22f15..4e4dcb80 100644
--- a/third-party/mimalloc/src/segment.c
+++ b/third-party/mimalloc/src/segment.c
@@ -347,7 +347,7 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa
 }
 
 
-static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) {
+static size_t mi_segment_calculate_slices(size_t required, size_t* info_slices) {
   size_t page_size = _mi_os_page_size();
   size_t isize     = _mi_align_up(sizeof(mi_segment_t), page_size);
   size_t guardsize = 0;
@@ -361,7 +361,6 @@ static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, siz
     }
   }
 
-  if (pre_size != NULL) *pre_size = isize;
   isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
   if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
   size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );
@@ -624,7 +623,9 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size
   mi_assert_internal(slice->slice_count == slice_count); // no overflow?
   slice->slice_offset = 0;
   if (slice_count > 1) {
-    mi_slice_t* last = &segment->slices[slice_index + slice_count - 1];
+    mi_slice_t* last = slice + slice_count - 1;
+    mi_slice_t* end  = (mi_slice_t*)mi_segment_slices_end(segment);
+    if (last > end) { last = end; }
     last->slice_count = 0;
     last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
     last->block_size = 0;
@@ -808,7 +809,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren
 ----------------------------------------------------------- */
 
 static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id,
-                                          size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices,
+                                          size_t* psegment_slices, size_t* pinfo_slices,
                                           bool commit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
 
 {
@@ -825,7 +826,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
     align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN );
     const size_t extra = align_offset - info_size;
     // recalculate due to potential guard pages
-    *psegment_slices = mi_segment_calculate_slices(required + extra, ppre_size, pinfo_slices);
+    *psegment_slices = mi_segment_calculate_slices(required + extra, pinfo_slices);
     mi_assert_internal(*psegment_slices > 0 && *psegment_slices <= UINT32_MAX);
   }
 
@@ -874,8 +875,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
 
   // calculate needed sizes first
   size_t info_slices;
-  size_t pre_size;
-  size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices);
+  size_t segment_slices = mi_segment_calculate_slices(required, &info_slices);
   mi_assert_internal(segment_slices > 0 && segment_slices <= UINT32_MAX);
 
   // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little)
@@ -887,7 +887,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
 
   // Allocate the segment from the OS
   mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id,
-                                              &segment_slices, &pre_size, &info_slices, commit, tld, os_tld);
+                                              &segment_slices, &info_slices, commit, tld, os_tld);
   if (segment == NULL) return NULL;
 
   // zero the segment info? -- not always needed as it may be zero initialized from the OS
@@ -915,8 +915,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
   if (MI_SECURE>0) {
     // in secure mode, we set up a protected page in between the segment info
     // and the page data, and at the end of the segment.
-    size_t os_pagesize = _mi_os_page_size();
-    mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size);
+    size_t os_pagesize = _mi_os_page_size();    
     _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
     uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
     mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats);
@@ -1007,11 +1006,13 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
     _mi_os_reset(start, psize, tld->stats);
   }
 
-  // zero the page data, but not the segment fields
+  // zero the page data, but not the segment fields and heap tag
   page->is_zero_init = false;
+  uint8_t heap_tag = page->heap_tag;
   ptrdiff_t ofs = offsetof(mi_page_t, capacity);
   _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs);
   page->block_size = 1;
+  page->heap_tag = heap_tag;
 
   // and free it
   mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);
@@ -1212,8 +1213,13 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
       mi_assert_internal(page->next == NULL && page->prev==NULL);
       _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
       segment->abandoned--;
-      // set the heap again and allow delayed free again
-      mi_page_set_heap(page, heap);
+      // set the heap again and allow heap thread delayed free again.
+      mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag);  // allow custom heaps to separate objects
+      if (target_heap == NULL) {
+        target_heap = heap;
+        _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag );
+      }
+      mi_page_set_heap(page, target_heap);
       _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
       _mi_page_free_collect(page, false); // ensure used count is up to date
       if (mi_page_all_free(page)) {
@@ -1222,8 +1228,8 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
       }
       else {
         // otherwise reclaim it into the heap
-        _mi_page_reclaim(heap, page);
-        if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page)) {
+        _mi_page_reclaim(target_heap, page);
+        if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) {
           if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
         }
       }
diff --git a/third-party/tbb/.bazelversion b/third-party/tbb/.bazelversion
index 09b254e9..21c8c7b4 100644
--- a/third-party/tbb/.bazelversion
+++ b/third-party/tbb/.bazelversion
@@ -1 +1 @@
-6.0.0
+7.1.1
diff --git a/third-party/tbb/.github/CODEOWNERS b/third-party/tbb/.github/CODEOWNERS
new file mode 100644
index 00000000..31805797
--- /dev/null
+++ b/third-party/tbb/.github/CODEOWNERS
@@ -0,0 +1,7 @@
+# Lines starting with '#' are comments.
+# Each line is a file pattern followed by one or more owners.
+
+# More details are here: https://help.github.com/articles/about-codeowners/
+
+src/tbbmalloc @ldorau @lplewa @kfilipek
+src/tbbmalloc_proxy @ldorau @lplewa @kfilipek
diff --git a/third-party/tbb/.github/workflows/ci.yml b/third-party/tbb/.github/workflows/ci.yml
index a6d710f8..a65de622 100644
--- a/third-party/tbb/.github/workflows/ci.yml
+++ b/third-party/tbb/.github/workflows/ci.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023 Intel Corporation
+# Copyright (c) 2021-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@ on:
       - synchronize
       - reopened
 
+permissions: read-all
+
 env:
   BUILD_CONCURRENCY: 2
   MACOS_BUILD_CONCURRENCY: 3
@@ -57,7 +59,7 @@ jobs:
     needs: [codespell]
     env:
       BUILD_TYPE: oss
-    runs-on: [ubuntu-20.04]
+    runs-on: [ubuntu-22.04]
     timeout-minutes: 10
     steps:
       - uses: actions/checkout@v2
@@ -80,6 +82,10 @@ jobs:
 
   pages:
     if: ${{ github.ref == 'refs/heads/master' }}
+    permissions:
+      contents: write
+      pages: write
+      id-token: write
     runs-on: ubuntu-latest
     needs: [documentation]
     steps:
@@ -140,7 +146,7 @@ jobs:
           ctest -R python_test --output-on-failure --timeout ${TEST_TIMEOUT}
 
   linux-testing:
-    name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}
+    name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}${{ matrix.cmake_static }}
     runs-on: ['${{ matrix.os }}']
     timeout-minutes: 45
     strategy:
@@ -165,6 +171,13 @@ jobs:
             std: 20
             build_type: debug
             preview: 'ON'
+          - os: ubuntu-22.04
+            c_compiler: gcc-11
+            cxx_compiler: g++-11
+            std: 20
+            build_type: release
+            preview: 'ON'
+            cmake_static: -DBUILD_SHARED_LIBS=OFF
     steps:
       - uses: actions/checkout@v2
       - name: Run testing
@@ -172,13 +185,13 @@ jobs:
         run: |
           set -x
           mkdir build && cd build
-          cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+          cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_static }} \
             -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} ..
           make VERBOSE=1 -j${BUILD_CONCURRENCY}
           ctest --timeout ${TEST_TIMEOUT} --output-on-failure
 
   macos-testing:
-    name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}
+    name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}${{ matrix.cmake_static }}
     runs-on: ['${{ matrix.os }}']
     timeout-minutes: 45
     strategy:
@@ -191,6 +204,13 @@ jobs:
             std: 14
             build_type: relwithdebinfo
             preview: 'ON'
+          - os: macos-13
+            c_compiler: clang
+            cxx_compiler: clang++
+            std: 20
+            build_type: release
+            preview: 'ON'
+            cmake_static: -DBUILD_SHARED_LIBS=OFF
     steps:
       - uses: actions/checkout@v2
       - name: Run testing
@@ -198,7 +218,7 @@ jobs:
         run: |
           set -x
           mkdir build && cd build
-          cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+          cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_static }} \
             -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} ..
           make VERBOSE=1 -j${MACOS_BUILD_CONCURRENCY}
           ctest --timeout ${TEST_TIMEOUT} --output-on-failure
@@ -219,6 +239,15 @@ jobs:
             build_type: relwithdebinfo
             preview: 'ON'
             job_name: windows_cl2019_cxx14_relwithdebinfo_preview=ON
+          - os: windows-2019
+            generator: Visual Studio 16 2019
+            c_compiler: cl
+            cxx_compiler: cl
+            std: 20
+            build_type: release
+            preview: 'ON'
+            job_name: windows_cl2019_cxx20_release_preview=ON-DBUILD_SHARED_LIBS=OFF
+            cmake_static: -DBUILD_SHARED_LIBS=OFF
           - os: windows-2022
             generator: Visual Studio 17 2022
             c_compiler: cl
@@ -233,7 +262,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake -G "${{ matrix.generator }}" -A x64 -DCMAKE_CXX_STANDARD=${{ matrix.std }} `
+          cmake -G "${{ matrix.generator }}" -A x64 -DCMAKE_CXX_STANDARD=${{ matrix.std }} ${{ matrix.cmake_static }} `
             -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} `
             -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} ..
           cmake --build . --config ${{ matrix.build_type }} -j -v
diff --git a/third-party/tbb/.github/workflows/issue_labeler.yml b/third-party/tbb/.github/workflows/issue_labeler.yml
index 418d7bac..80591aa9 100644
--- a/third-party/tbb/.github/workflows/issue_labeler.yml
+++ b/third-party/tbb/.github/workflows/issue_labeler.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Intel Corporation
+# Copyright (c) 2023-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@ on:
   pull_request:
     types: [opened, edited]
 
+permissions: read-all
+
 jobs:
   triage:
     runs-on: ubuntu-latest
diff --git a/third-party/tbb/.github/workflows/labeler.yml b/third-party/tbb/.github/workflows/labeler.yml
index 8dbb0962..36812ebd 100644
--- a/third-party/tbb/.github/workflows/labeler.yml
+++ b/third-party/tbb/.github/workflows/labeler.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Intel Corporation
+# Copyright (c) 2023-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@ name: "Pull Request Labeler"
 on:
   - pull_request_target
 
+permissions: read-all
+
 jobs:
   triage:
     permissions:
diff --git a/third-party/tbb/BUILD.bazel b/third-party/tbb/BUILD.bazel
index 3881d684..34f98eba 100644
--- a/third-party/tbb/BUILD.bazel
+++ b/third-party/tbb/BUILD.bazel
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022 Intel Corporation
+# Copyright (c) 2021-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -116,3 +116,16 @@ cc_library(
         ":tbbmalloc",
     ],
 )
+
+cc_test(
+    name = "test_task",
+    srcs = [
+        "test/tbb/test_task.cpp",
+    ] + glob([
+        "test/common/*.h",
+    ]),
+    includes = ["test"],
+    deps = [
+        ":tbb",
+    ],
+)
diff --git a/third-party/tbb/CMakeLists.txt b/third-party/tbb/CMakeLists.txt
index 16ee29ed..19232a99 100644
--- a/third-party/tbb/CMakeLists.txt
+++ b/third-party/tbb/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 # Enable CMake policies
 
+if (POLICY CMP0068)
+    # RPATH settings do not affect install_name on macOS since CMake 3.9
+    cmake_policy(SET CMP0068 NEW)
+endif()
+
 if (POLICY CMP0091)
     # The NEW behavior for this policy is to not place MSVC runtime library flags in the default
     # CMAKE_<LANG>_FLAGS_<CONFIG> cache entries and use CMAKE_MSVC_RUNTIME_LIBRARY abstraction instead.
@@ -38,12 +43,6 @@ if (APPLE)
     endif()
 endif()
 
-# Until CMake 3.4.0 FindThreads.cmake requires C language enabled.
-# Enable C language before CXX to avoid possible override of CMAKE_SIZEOF_VOID_P.
-if (CMAKE_VERSION VERSION_LESS 3.4)
-    enable_language(C)
-endif()
-
 file(READ include/oneapi/tbb/version.h _tbb_version_info)
 string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1" _tbb_ver_major "${_tbb_version_info}")
 string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_minor "${_tbb_version_info}")
@@ -104,9 +103,13 @@ option(TBBMALLOC_BUILD "Enable tbbmalloc build" ON)
 cmake_dependent_option(TBBMALLOC_PROXY_BUILD "Enable tbbmalloc_proxy build" ON "TBBMALLOC_BUILD" OFF)
 option(TBB_CPF "Enable preview features of the library" OFF)
 option(TBB_FIND_PACKAGE "Enable search for external oneTBB using find_package instead of build from sources" OFF)
-option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg-config tool" OFF)
+option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg-config tool" ${CMAKE_CROSSCOMPILING})
 option(TBB_ENABLE_IPO "Enable Interprocedural Optimization (IPO) during the compilation" ON)
 option(TBB_FUZZ_TESTING "Enable fuzz testing" OFF)
+option(TBB_INSTALL "Enable installation" ON)
+if(APPLE)
+option(TBB_BUILD_APPLE_FRAMEWORKS "Build as Apple Frameworks" OFF)
+endif()
 
 if (NOT DEFINED BUILD_SHARED_LIBS)
     set(BUILD_SHARED_LIBS ON)
@@ -194,7 +197,7 @@ endif()
 # -------------------------------------------------------------------
 # Common dependencies
 #force -pthread during compilation for Emscripten
-if (EMSCRIPTEN)
+if (EMSCRIPTEN AND NOT EMSCRIPTEN_WITHOUT_PTHREAD)
    set(THREADS_HAVE_PTHREAD_ARG TRUE)
 endif()
 
@@ -230,7 +233,7 @@ else()
     message(WARNING "TBB compiler settings not found ${TBB_COMPILER_SETTINGS_FILE}")
 endif()
 
-if (TBB_FIND_PACKAGE OR TBB_DIR)
+if (TBB_FIND_PACKAGE AND TBB_DIR)
     # Allow specifying external TBB to test with.
     # Do not add main targets and installation instructions in that case.
     message(STATUS "Using external TBB for testing")
@@ -250,34 +253,39 @@ else()
     else()
         add_subdirectory(src/tbbbind)
     endif()
+    if (TBB_INSTALL)
+        # -------------------------------------------------------------------
+        # Installation instructions
+        include(CMakePackageConfigHelpers)
+
+        install(DIRECTORY include/
+                DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+                COMPONENT devel)
+
+        install(EXPORT ${PROJECT_NAME}Targets
+                NAMESPACE TBB::
+                DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+                COMPONENT devel)
+        file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
+                   "include(\${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Targets.cmake)\n")
+        if (NOT BUILD_SHARED_LIBS)
+            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
+                       "include(CMakeFindDependencyMacro)\nfind_dependency(Threads)\n")
+        endif()
+
+        write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
+                                         COMPATIBILITY AnyNewerVersion)
 
-    # -------------------------------------------------------------------
-    # Installation instructions
-    include(CMakePackageConfigHelpers)
-
-    install(DIRECTORY include/
-            DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-            COMPONENT devel)
-
-    install(EXPORT ${PROJECT_NAME}Targets
-            NAMESPACE TBB::
-            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
-            COMPONENT devel)
-    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
-               "include(\${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Targets.cmake)\n")
-
-    write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
-                                     COMPATIBILITY AnyNewerVersion)
-
-    install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
-                  "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
-            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
-            COMPONENT devel)
-
-    install(FILES "README.md"
-            DESTINATION ${CMAKE_INSTALL_DOCDIR}
-            COMPONENT devel)
-    # -------------------------------------------------------------------
+        install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
+                      "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
+                DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+                COMPONENT devel)
+
+        install(FILES "README.md"
+                DESTINATION ${CMAKE_INSTALL_DOCDIR}
+                COMPONENT devel)
+        # -------------------------------------------------------------------
+    endif()
 endif()
 
 if (TBB_TEST)
diff --git a/third-party/tbb/CONTRIBUTING.md b/third-party/tbb/CONTRIBUTING.md
index c8b43708..3048b211 100644
--- a/third-party/tbb/CONTRIBUTING.md
+++ b/third-party/tbb/CONTRIBUTING.md
@@ -29,11 +29,6 @@ The DCO is an attestation attached to every contribution made by every developer
 
 As a contributor, you’ll want to be familiar with the oneTBB project and the repository layout. You should also know how to use it as explained in the [oneTBB documentation](https://oneapi-src.github.io/oneTBB/) and how to set up your build development environment to configure, build, and test oneTBB as explained in the [oneTBB Build System Description](cmake/README.md). 
 
-## Issues 
-If you face a problem, first check out open [oneTBB GitHub issues](https://github.com/oneapi-src/oneTBB/issues) to see if the issue you’d like to address is already reported. You may find users that have encountered the bug you’re finding or have similar ideas for changes or additions.
-
-You can use issues to report a problem, make a feature request, or add comments on an existing issue. 
-
 ## Pull Requests 
 
 You can find all [open oneTBB pull requests](https://github.com/oneapi-src/oneTBB/pulls) on GitHub. 
diff --git a/third-party/tbb/INSTALL.md b/third-party/tbb/INSTALL.md
index 3c63c9fd..0ac95f87 100644
--- a/third-party/tbb/INSTALL.md
+++ b/third-party/tbb/INSTALL.md
@@ -61,7 +61,7 @@ You can use the ``install`` components for partial installation.
 The following install components are supported:
 - `runtime` - oneTBB runtime package (core shared libraries and `.dll` files on Windows* OS).
 - `devel` - oneTBB development package (header files, CMake integration files, library symbolic links, and `.lib` files on Windows* OS).
-- `tbb4py` - [oneTBB Module for Python](#onetbb-python-module-support).
+- `tbb4py` - [oneTBB Module for Python](https://github.com/oneapi-src/oneTBB/blob/master/python/README.md).
 
 If you want to install specific components after configuration and build, run:
 
diff --git a/third-party/tbb/MODULE.bazel b/third-party/tbb/MODULE.bazel
new file mode 100644
index 00000000..cc6698f0
--- /dev/null
+++ b/third-party/tbb/MODULE.bazel
@@ -0,0 +1,24 @@
+# Copyright (c) 2021-2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: Bazel support is community-based. The maintainers do not
+# use Bazel internally. The Bazel build can have security risks or
+# optimization gaps.
+
+module(
+    name = "onetbb",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "platforms", version = "0.0.9")
diff --git a/third-party/tbb/README.md b/third-party/tbb/README.md
index b96e1fb0..f2bc0a0a 100644
--- a/third-party/tbb/README.md
+++ b/third-party/tbb/README.md
@@ -23,7 +23,8 @@ oneTBB is a part of [oneAPI](https://oneapi.io). The current branch implements v
 > **_NOTE:_** Threading Building Blocks (TBB) is now called oneAPI Threading Building Blocks (oneTBB) to highlight that the tool is a part of the oneAPI ecosystem.
 
 ## Release Information
-Here are [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREMENTS.md).
+
+See [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREMENTS.md).
 
 ## Documentation
 * [oneTBB Specification](https://spec.oneapi.com/versions/latest/elements/oneTBB/source/nested-index.html)
@@ -39,7 +40,7 @@ Here are [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQU
 See [Installation from Sources](INSTALL.md) to learn how to install oneTBB. 
 
 ## Support
-Please report issues and suggestions via [GitHub issues](https://github.com/oneapi-src/oneTBB/issues). See our [documentation](./CONTRIBUTING.md##Issues) to learn how to work with them.
+See our [documentation](./SUPPORT.md) to learn how to request help.
 
 ## How to Contribute
 We welcome community contributions, so check our [Contributing Guidelines](CONTRIBUTING.md)
@@ -49,7 +50,6 @@ to learn more.
 oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE.txt).
 By its terms, contributions submitted to the project are also done under that license.
 
-
 ## Engineering team contacts
 * [Email us.](mailto:inteltbbdevelopers@intel.com)
 
diff --git a/third-party/tbb/RELEASE_NOTES.md b/third-party/tbb/RELEASE_NOTES.md
index 57258416..c9b8e971 100644
--- a/third-party/tbb/RELEASE_NOTES.md
+++ b/third-party/tbb/RELEASE_NOTES.md
@@ -18,26 +18,25 @@
 This document contains changes of oneTBB compared to the last release.
 
 ## Table of Contents <!-- omit in toc -->
-- [New Features](#new-features)
 - [Known Limitations](#known-limitations)
 - [Fixed Issues](#fixed-issues)
 
-## :tada: New Features
-- Since C++17, parallel algorithms and Flow Graph nodes are allowed to accept pointers to the member functions and member objects as the user-provided callables.
-- Added missed member functions, such as assignment operators and swap function, to the ``concurrent_queue`` and ``concurrent_bounded_queue`` containers.
-
 ## :rotating_light: Known Limitations
-- A static assert will cause compilation failures in oneTBB headers when compiling with clang 12.0.0 or newer if using the LLVM standard library with ``-ffreestanding`` and C++11/14 compiler options. 
-- An application using Parallel STL algorithms in libstdc++ versions 9 and 10 may fail to compile due to incompatible interface changes between earlier versions of Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB). Disable support for Parallel STL algorithms by defining ``PSTL_USE_PARALLEL_POLICIES`` (in libstdc++ 9) or ``_GLIBCXX_USE_TBB_PAR_BACKEND`` (in libstdc++ 10) macro to zero before inclusion of the first standard header file in each translation unit.
-- On Linux* OS, if oneAPI Threading Building Blocks (oneTBB) or Threading Building Blocks (TBB) are installed in a system folder like ``/usr/lib64``, the application may fail to link due to the order in which the linker searches for libraries. Use the ``-L`` linker option to specify the correct location of oneTBB library. This issue does not affect the program execution.
-- The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc* version lower than 2.5.
-- Using a hwloc* version other than 1.11, 2.0, or 2.5 may cause an undefined behavior on Windows* OS. See https://github.com/open-mpi/hwloc/issues/477 for details.
-- The NUMA* topology may be detected incorrectly on Windows* OS machines where the number of NUMA* node threads exceeds the size of 1 processor group.
-- On Windows* OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying ``/wd4324`` to the compiler command line.
-- oneTBB does not support ``fork()``, to work-around the issue, consider using task_scheduler_handle to join oneTBB worker threads before using fork().
-- C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293).
+- The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc version lower than 2.5.
+- Using a hwloc version other than 1.11, 2.0, or 2.5 may cause undefined behavior on Windows OS. See https://github.com/open-mpi/hwloc/issues/477 for details.
+- The NUMA topology may be detected incorrectly on Windows* OS machines where the number of NUMA node threads exceeds the size of 1 processor group.
+- On Windows OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying /wd4324 to the compiler command line.
+- C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293).
+- When CPU resource coordination is enabled, tasks from a lower-priority ``task_arena`` might be executed before tasks from a higher-priority ``task_arena``.
+
+> **_NOTE:_**  To see known limitations that impact all versions of oneTBB, refer to [oneTBB Documentation](https://oneapi-src.github.io/oneTBB/main/intro/limitations.html).
+
 
 ## :hammer: Fixed Issues
-- Fixed the hang in the reserve method of concurrent unordered containers ([GitHub* #1056](http://github.com/oneapi-src/oneTBB/issues/1056)).
-- Fixed the C++20 three-way comparison feature detection ([GitHub* #1093](http://github.com/oneapi-src/oneTBB/issues/1093)).
-- Fixed oneTBB integration with CMake* in the Conda* environment.
+- Fixed ``parallel_for_each`` algorithm behavior for iterators defining ``iterator_concept`` trait instead of ``iterator_category``.
+- Fixed the redefinition issue for ``std::min`` and ``std::max`` on Windows* OS ([GitHub* #832](https://github.com/oneapi-src/oneTBB/issues/832)).
+- Fixed the incorrect binary search order in ``TBBConfig.cmake``.
+- Enabled the oneTBB library search using the pkg-config tool in Conda packages.
+
+## :octocat: Open-source Contributions Integrated
+- Fixed the compiler warning for missing virtual destructor. Contributed by Elias Engelbert Plank (https://github.com/oneapi-src/oneTBB/pull/1215).
diff --git a/third-party/tbb/SECURITY.md b/third-party/tbb/SECURITY.md
index c4a49dd5..4926041f 100644
--- a/third-party/tbb/SECURITY.md
+++ b/third-party/tbb/SECURITY.md
@@ -1,7 +1,66 @@
 # Security Policy
-Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, 
-impact, severity and mitigation. 
+As an open-source project, we understand the importance of and responsibility
+for security. This Security Policy outlines our guidelines and procedures to
+ensure the highest level of security and trust for oneTBB users. 
 
-## Reporting a Vulnerability
-Please report any security vulnerabilities in this project 
-[utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html).
+## Supported Versions
+Security vulnerabilities are fixed in the [latest version][1]
+and delivered as a patch release. We don't guarantee security fixes to be
+back-ported to older oneTBB versions.
+
+## Report a Vulnerability
+We are very grateful to the security researchers and users that report back
+security vulnerabilities. We investigate every report thoroughly.
+We strongly encourage you to report security vulnerabilities to us privately,
+before disclosing them on public forums or opening a public GitHub* issue. 
+
+Report a vulnerability to us in one of two ways:
+* Open a draft **[GitHub* Security Advisory][2]**
+* Send an e-mail to: **security@uxlfoundation.org**.
+Along with the report, provide the following info:
+  * A descriptive title.
+  * Your name and affiliation (if any).
+  * A description of the technical details of the vulnerabilities.
+  * A minimal example of the vulnerability so we can reproduce your findings.
+  * An explanation of who can exploit this vulnerability, and what they gain
+  by doing so.
+  * Whether this vulnerability is public or known to third parties. If it is,
+  provide details.
+
+### When Should I Report a Vulnerability?
+* You think you discovered a potential security vulnerability in oneTBB.
+* You are unsure how the potential vulnerability affects oneTBB.
+* You think you discovered a vulnerability in another project or 3rd party
+component on which oneTBB depends. If the issue is not fixed in the 3rd party
+component, try to report directly there first.
+
+### When Should I NOT Report a Vulnerability?
+* You got an automated scan hit and are unable to provide details.
+* You need help using oneTBB for security.
+* You need help applying security-related updates.
+* Your issue is not security-related.
+
+## Security Reports Review Process
+We aim to respond quickly to your inquiry and coordinate a fix and
+disclosure with you. All confirmed security vulnerabilities will be addressed
+according to severity level and impact on oneTBB. Normally, security issues
+are fixed in the next planned release.
+
+## Disclosure Policy
+We will publish security advisories using the 
+[**GitHub Security Advisories feature**][3]
+to keep our community well-informed, and will credit you for your findings
+unless you prefer to stay anonymous. We request that you refrain from
+exploiting the vulnerability or making it public before the official disclosure.
+
+We will disclose the vulnerabilities and bugs as soon as possible once
+mitigation is implemented and available. 
+
+## Feedback on This Policy
+If you have any suggestions on how this Policy could be improved, submit
+an issue or a pull request to this repository. **Do not** report
+potential vulnerabilities or security flaws via a pull request.
+
+[1]: https://github.com/oneapi-src/oneTBB/releases/latest
+[2]: https://github.com/oneapi-src/oneTBB/security/advisories/new
+[3]: https://github.com/oneapi-src/oneTBB/security/advisories
diff --git a/third-party/tbb/SUPPORT.md b/third-party/tbb/SUPPORT.md
new file mode 100644
index 00000000..47bb60a5
--- /dev/null
+++ b/third-party/tbb/SUPPORT.md
@@ -0,0 +1,35 @@
+<!--
+******************************************************************************
+* 
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/-->
+
+# oneTBB Support
+
+We are committed to providing support and assistance to help you make the most out of oneTBB. 
+Use the following methods if you face any challenges. 
+
+## Issues
+
+If you have a problem, check out the [GitHub Issues](https://github.com/oneapi-src/oneTBB/issues) to see if the issue you want to address is already reported. 
+You may find users that have encountered the same bug or have similar ideas for changes or updates.
+
+You can use issues to report a problem, make a feature request, or add comments on an existing issue.
+
+## Discussions 
+
+Visit the [GitHub Discussions](https://github.com/oneapi-src/oneTBB/discussions) to engage with the community, ask questions, or help others. 
+
+## Email
+
+Reach out to us privately via [email](mailto:inteltbbdevelopers@intel.com). 
\ No newline at end of file
diff --git a/third-party/tbb/WASM_Support.md b/third-party/tbb/WASM_Support.md
index 67925ee4..8c2f6c1a 100644
--- a/third-party/tbb/WASM_Support.md
+++ b/third-party/tbb/WASM_Support.md
@@ -16,16 +16,45 @@
 
 # WASM Support
 
+oneTBB extends its capabilities by offering robust support for ``WASM``. 
+
 ``WASM`` stands for WebAssembly, a low-level binary format for executing code in web browsers. 
-It is designed to be a portable target for compilers and to be efficient to parse and execute. 
+It is designed to be a portable target for compilers and efficient to parse and execute. 
+
+Using oneTBB with WASM, you can take full advantage of parallelism and concurrency while working on web-based applications, interactive websites, and a variety of other WASM-compatible platforms.
+
+oneTBB offers WASM support through the integration with [Emscripten*](https://emscripten.org/docs/introducing_emscripten/index.html), a powerful toolchain for compiling C and C++ code into WASM-compatible runtimes. 
+
+## Build
+
+**Prerequisites:** Download and install Emscripten*. See the [instructions](https://emscripten.org/docs/getting_started/downloads.html). 
+
+To build the system, run:
+
+```
+mkdir build && cd build
+emcmake cmake .. -DCMAKE_CXX_COMPILER=em++ -DCMAKE_C_COMPILER=emcc -DTBB_STRICT=OFF -DCMAKE_CXX_FLAGS=-Wno-unused-command-line-argument -DTBB_DISABLE_HWLOC_AUTOMATIC_SEARCH=ON -DBUILD_SHARED_LIBS=ON -DTBB_EXAMPLES=ON -DTBB_TEST=ON
+```
+To compile oneTBB without ``pthreads``, add the flag ``-DEMSCRIPTEN_WITHOUT_PTHREAD=true`` to the command above. By default, oneTBB uses ``pthreads``.
+```
+cmake --build . <options>
+cmake --install . <options>
+```
+Where:
+
+* ``emcmake`` - a tool that sets up the environment for Emscripten*. 
+* ``-DCMAKE_CXX_COMPILER=em++`` - specifies the C++ compiler as Emscripten* C++ compiler. 
+* ``-DCMAKE_C_COMPILER=emcc`` - specifies the C compiler as Emscripten* C compiler.
+
 
-WebAssembly aims to provide a fast, efficient, and safe way to run code in web browsers without needing plugins or other software. Code written in a variety of programming languages, including C, C++, Rust and others, can be compiled into WebAssembly format for use in web pages. This allows you to write high-performance applications that run directly in the browser.
+> **_NOTE:_** See [CMake documentation](https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md) to learn about other options. 
 
-We currently have an [under development branch that provides you with WASM support](https://github.com/oneapi-src/oneTBB/tree/tbb_wasm). 
 
-By using WASM, you can:
-* Create highly performant and scalable applications that can meet the demands of modern web-based systems. 
-* Take advantage of oneTBB features to optimize the performance of your web-based applications.
+## Run Test
 
+To run tests, use:
 
+```
+ctest
+```
 
diff --git a/third-party/tbb/WORKSPACE.bazel b/third-party/tbb/WORKSPACE.bazel
index 6431b29b..59ba39f7 100644
--- a/third-party/tbb/WORKSPACE.bazel
+++ b/third-party/tbb/WORKSPACE.bazel
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Intel Corporation
+# Copyright (c) 2021-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,4 +16,4 @@
 # use Bazel internally. The Bazel build can have security risks or 
 # optimization gaps.
 
-workspace(name = "oneTBB")
+# WORKSPACE marker file needed by Bazel
diff --git a/third-party/tbb/cmake/README.md b/third-party/tbb/cmake/README.md
index ff37ad8e..aa811b0f 100644
--- a/third-party/tbb/cmake/README.md
+++ b/third-party/tbb/cmake/README.md
@@ -14,10 +14,12 @@ TBBMALLOC_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB)
 TBBMALLOC_PROXY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator proxy build (requires TBBMALLOC_BUILD. ON by default)
 TBB4PY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) Python module build (OFF by default)
 TBB_CPF:BOOL - Enable preview features of the library (OFF by default)
+TBB_INSTALL:BOOL - Enable installation (ON by default)
 TBB_INSTALL_VARS:BOOL - Enable auto-generated vars installation(packages generated by `cpack` and `make install` will also include the vars script)(OFF by default)
 TBB_VALGRIND_MEMCHECK:BOOL - Enable scan for memory leaks using Valgrind (OFF by default)
 TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH - Disable HWLOC automatic search by pkg-config tool (OFF by default)
 TBB_ENABLE_IPO - Enable Interprocedural Optimization (IPO) during the compilation (ON by default)
+TBB_BUILD_APPLE_FRAMEWORKS - Build Apple* frameworks instead of dylibs; only available on Apple platforms (OFF by default)
 ```
 
 ## Configure, Build, and Test
diff --git a/third-party/tbb/cmake/compilers/Clang.cmake b/third-party/tbb/cmake/compilers/Clang.cmake
index 7ce4d46d..f56b5fba 100644
--- a/third-party/tbb/cmake/compilers/Clang.cmake
+++ b/third-party/tbb/cmake/compilers/Clang.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,9 @@ if (EMSCRIPTEN)
   set(TBB_EMSCRIPTEN 1)
   set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions)
   set(TBB_TEST_LINK_FLAGS  ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sEXIT_RUNTIME=1)
-  set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread")
+  if (NOT EMSCRIPTEN_WITHOUT_PTHREAD)
+      set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread")
+  endif()
 endif()
 
 if (MINGW)
@@ -52,7 +54,7 @@ if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag)
 endif()
 
 # Enable Intel(R) Transactional Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)")
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)" AND NOT EMSCRIPTEN)
     set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$<NOT:$<VERSION_LESS:${CMAKE_CXX_COMPILER_VERSION},12.0>>:-mwaitpkg>)
 endif()
 
@@ -66,7 +68,9 @@ endif()
 
 set(TBB_COMMON_LINK_LIBS ${CMAKE_DL_LIBS})
 
-set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$<NOT:$<CONFIG:Debug>>:-D_FORTIFY_SOURCE=2>)
+if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE")
+  set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$<NOT:$<CONFIG:Debug>>:-D_FORTIFY_SOURCE=2>)
+endif ()
 
 if (MINGW)
     list(APPEND TBB_COMMON_COMPILE_FLAGS -U__STRICT_ANSI__)
diff --git a/third-party/tbb/cmake/compilers/GNU.cmake b/third-party/tbb/cmake/compilers/GNU.cmake
index 08c7f2e5..6fd8d980 100644
--- a/third-party/tbb/cmake/compilers/GNU.cmake
+++ b/third-party/tbb/cmake/compilers/GNU.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -71,12 +71,13 @@ endif ()
 set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv)
 set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security
     -fstack-protector-strong )
-# -z switch is not supported on MacOS
-if (NOT APPLE)
+# -z switch is not supported on MacOS and MinGW
+if (NOT APPLE AND NOT MINGW)
     set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now,-z,noexecstack)
 endif()
-set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$<NOT:$<CONFIG:Debug>>:-D_FORTIFY_SOURCE=2> )
-
+if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE")
+  set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$<NOT:$<CONFIG:Debug>>:-D_FORTIFY_SOURCE=2> )
+endif ()
 
 # TBB malloc settings
 set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions)
diff --git a/third-party/tbb/cmake/compilers/Intel.cmake b/third-party/tbb/cmake/compilers/Intel.cmake
index 582f9a84..531e078e 100644
--- a/third-party/tbb/cmake/compilers/Intel.cmake
+++ b/third-party/tbb/cmake/compilers/Intel.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +21,11 @@ if (MSVC)
 elseif (APPLE)
     include(${CMAKE_CURRENT_LIST_DIR}/AppleClang.cmake)
     set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fstack-protector -Wformat -Wformat-security
-                                 $<$<NOT:$<CONFIG:Debug>>:-fno-omit-frame-pointer -qno-opt-report-embed -D_FORTIFY_SOURCE=2>)
+                                 $<$<NOT:$<CONFIG:Debug>>:-fno-omit-frame-pointer -qno-opt-report-embed>)
+    if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE")
+        set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$<NOT:$<CONFIG:Debug>>:-D_FORTIFY_SOURCE=2>)
+    endif ()
+
     set(TBB_OPENMP_FLAG -qopenmp)
     set(TBB_IPO_COMPILE_FLAGS $<$<NOT:$<CONFIG:Debug>>:-ipo>)
 else()
diff --git a/third-party/tbb/cmake/compilers/IntelLLVM.cmake b/third-party/tbb/cmake/compilers/IntelLLVM.cmake
index 89d56ae6..a9ebb3e6 100644
--- a/third-party/tbb/cmake/compilers/IntelLLVM.cmake
+++ b/third-party/tbb/cmake/compilers/IntelLLVM.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,10 +14,12 @@
 
 if (WIN32)
     include(${CMAKE_CURRENT_LIST_DIR}/MSVC.cmake)
+    set(TBB_OPENMP_FLAG /Qopenmp)
     set(TBB_IPO_COMPILE_FLAGS $<$<NOT:$<CONFIG:Debug>>:/Qipo>)
     set(TBB_IPO_LINK_FLAGS $<$<NOT:$<CONFIG:Debug>>:/INCREMENTAL:NO>)
 else()
     include(${CMAKE_CURRENT_LIST_DIR}/Clang.cmake)
     set(TBB_IPO_COMPILE_FLAGS $<$<NOT:$<CONFIG:Debug>>:-ipo>)
+    set(TBB_OPENMP_FLAG -qopenmp)
 endif()
 set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS})
diff --git a/third-party/tbb/cmake/compilers/MSVC.cmake b/third-party/tbb/cmake/compilers/MSVC.cmake
index 0e0dfd31..6568ec7e 100644
--- a/third-party/tbb/cmake/compilers/MSVC.cmake
+++ b/third-party/tbb/cmake/compilers/MSVC.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,9 +33,9 @@ if (MSVC_VERSION LESS_EQUAL 1900)
     set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} /wd4503)
 endif()
 set(TBB_LIB_COMPILE_FLAGS -D_CRT_SECURE_NO_WARNINGS /GS)
-set(TBB_COMMON_COMPILE_FLAGS /volatile:iso /FS /EHsc)
+set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /volatile:iso /FS /EHsc)
 
-set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /DYNAMICBASE /NXCOMPAT)
+set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /DEPENDENTLOADFLAG:0x2000 /DYNAMICBASE /NXCOMPAT)
 
 if (TBB_ARCH EQUAL 32)
     set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /SAFESEH )
diff --git a/third-party/tbb/cmake/config_generation.cmake b/third-party/tbb/cmake/config_generation.cmake
index 0cbdd745..e4ef7bce 100644
--- a/third-party/tbb/cmake/config_generation.cmake
+++ b/third-party/tbb/cmake/config_generation.cmake
@@ -92,6 +92,7 @@ set(_tbbbind_bin_version ${tbb_gen_cfg_TBBBIND_BINARY_VERSION})
                 NAMES \${_tbb_component}\${_bin_version}.dll
                 PATHS \${_tbb_root}
                 PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\"
+                NO_DEFAULT_PATH
             )
 
             if (EXISTS \"\${_tbb_debug_lib}\")
@@ -99,6 +100,7 @@ set(_tbbbind_bin_version ${tbb_gen_cfg_TBBBIND_BINARY_VERSION})
                     NAMES \${_tbb_component}\${_bin_version}_debug.dll
                     PATHS \${_tbb_root}
                     PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\"
+                    NO_DEFAULT_PATH
                 )
             endif()
 ")
diff --git a/third-party/tbb/cmake/hwloc_detection.cmake b/third-party/tbb/cmake/hwloc_detection.cmake
index 47233b17..aaca5a59 100644
--- a/third-party/tbb/cmake/hwloc_detection.cmake
+++ b/third-party/tbb/cmake/hwloc_detection.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,8 +46,6 @@ endforeach()
 unset(HWLOC_TARGET_NAME)
 
 if (NOT HWLOC_TARGET_EXPLICITLY_DEFINED AND
-    # No hwloc auto detection for cross compilation
-    NOT CMAKE_CROSSCOMPILING AND
     NOT TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH
 )
     find_package(PkgConfig QUIET)
diff --git a/third-party/tbb/cmake/templates/TBBConfig.cmake.in b/third-party/tbb/cmake/templates/TBBConfig.cmake.in
index 18ac68d3..3131e3dd 100644
--- a/third-party/tbb/cmake/templates/TBBConfig.cmake.in
+++ b/third-party/tbb/cmake/templates/TBBConfig.cmake.in
@@ -65,6 +65,7 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS})
         NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}.@TBB_LIB_EXT@
         PATHS ${_tbb_root}
         PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@"
+        NO_DEFAULT_PATH
     )
 
     if (NOT TBB_FIND_RELEASE_ONLY)
@@ -72,6 +73,7 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS})
             NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}_debug.@TBB_LIB_EXT@
             PATHS ${_tbb_root}
             PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@"
+            NO_DEFAULT_PATH
         )
     endif()
 
diff --git a/third-party/tbb/cmake/utils.cmake b/third-party/tbb/cmake/utils.cmake
index 982a633f..21101989 100644
--- a/third-party/tbb/cmake/utils.cmake
+++ b/third-party/tbb/cmake/utils.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,31 +23,37 @@ macro(tbb_remove_compile_flag flag)
 endmacro()
 
 macro(tbb_install_target target)
-    install(TARGETS ${target}
-        EXPORT TBBTargets
-        LIBRARY
-            DESTINATION ${CMAKE_INSTALL_LIBDIR}
-            NAMELINK_SKIP
-            COMPONENT runtime
-        RUNTIME
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-            COMPONENT runtime
-        ARCHIVE
-            DESTINATION ${CMAKE_INSTALL_LIBDIR}
-            COMPONENT devel)
-
-    if (BUILD_SHARED_LIBS)
+    if (TBB_INSTALL)
         install(TARGETS ${target}
+            EXPORT TBBTargets
             LIBRARY
                 DESTINATION ${CMAKE_INSTALL_LIBDIR}
-                NAMELINK_ONLY
-                COMPONENT devel)
-    endif()
-    if (MSVC AND BUILD_SHARED_LIBS)
-        install(FILES $<TARGET_PDB_FILE:${target}>
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-            COMPONENT devel
-            OPTIONAL)
+                NAMELINK_SKIP
+                COMPONENT runtime
+            RUNTIME
+                DESTINATION ${CMAKE_INSTALL_BINDIR}
+                COMPONENT runtime
+            ARCHIVE
+                DESTINATION ${CMAKE_INSTALL_LIBDIR}
+                COMPONENT devel
+            FRAMEWORK
+                DESTINATION ${CMAKE_INSTALL_LIBDIR}
+                COMPONENT runtime
+                OPTIONAL)
+
+        if (BUILD_SHARED_LIBS)
+            install(TARGETS ${target}
+                LIBRARY
+                    DESTINATION ${CMAKE_INSTALL_LIBDIR}
+                    NAMELINK_ONLY
+                    COMPONENT devel)
+        endif()
+        if (MSVC AND BUILD_SHARED_LIBS)
+            install(FILES $<TARGET_PDB_FILE:${target}>
+                DESTINATION ${CMAKE_INSTALL_BINDIR}
+                COMPONENT devel
+                OPTIONAL)
+        endif()
     endif()
 endmacro()
 
diff --git a/third-party/tbb/cmake/vars_utils.cmake b/third-party/tbb/cmake/vars_utils.cmake
index 989fea26..54a9fda1 100644
--- a/third-party/tbb/cmake/vars_utils.cmake
+++ b/third-party/tbb/cmake/vars_utils.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,12 +26,20 @@ get_filename_component(TBB_VARS_TEMPLATE_NAME ${PROJECT_SOURCE_DIR}/integration/
 string(REPLACE ".in" "" TBB_VARS_NAME ${TBB_VARS_TEMPLATE_NAME})
 
 macro(tbb_gen_vars target)
+    if (NOT TBB_BUILD_APPLE_FRAMEWORKS)
+        set(BIN_PATH $<TARGET_FILE_DIR:${target}>)
+    else()
+        # For Apple* frameworks, the binaries are placed in a framework bundle. 
+        # When using an Apple* framework, you refer to the bundle, not the binary inside, so we take the bundle's path and go up one level.
+        # This path will then be used to generate the vars file, and the contents of the vars file will use the bundle's parent directory.
+        set(BIN_PATH $<TARGET_BUNDLE_DIR:${target}>/..)
+    endif()
     if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
         add_custom_command(TARGET ${target} POST_BUILD COMMAND
             ${CMAKE_COMMAND}
             -DBINARY_DIR=${CMAKE_BINARY_DIR}
             -DSOURCE_DIR=${PROJECT_SOURCE_DIR}
-            -DBIN_PATH=$<TARGET_FILE_DIR:${target}>
+            -DBIN_PATH=${BIN_PATH}
             -DVARS_TEMPLATE=${TBB_VARS_TEMPLATE}
             -DVARS_NAME=${TBB_VARS_NAME}
             -DTBB_INSTALL_VARS=${TBB_INSTALL_VARS}
diff --git a/third-party/tbb/doc/conf.py b/third-party/tbb/doc/conf.py
index 87593ebf..19da0a4c 100644
--- a/third-party/tbb/doc/conf.py
+++ b/third-party/tbb/doc/conf.py
@@ -137,10 +137,14 @@
         'use_issues_button': True,
         'use_edit_page_button': True,
         'repository_branch': 'master',
-        'extra_footer': '<p align="right"><a href="https://www.intel.com/content/www/us/en/privacy/intel-cookie-notice.html">Cookies</a></p>'
     }
 
+if BUILD_TYPE != 'oneapi' and BUILD_TYPE != 'dita':
+   html_theme_options = {
+    "extra_footer": "<div><a href='https://www.intel.com/content/www/us/en/privacy/intel-cookie-notice.html' data-cookie-notice='true'>Cookies</a> <a href='https://www.intel.com/content/www/us/en/privacy/intel-privacy-notice.html'>| Privacy</a> <a data-wap_ref='dns' id='wap_dns' href='https://www.intel.com/content/www/us/en/privacy/intel-cookie-notice.html'>| Do Not Share My Personal Information</a> </div><div>&copy; Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), <a href='http://opensource.org/licenses/0BSD'>http://opensource.org/licenses/0BSD</a>. </div><br><div>oneTBB is licensed under Apache License Version 2.0. Refer to the <a href='https://github.com/oneapi-src/oneTBB/blob/master/LICENSE.txt'>LICENSE </a> file for the full license text and copyright notice.</div>"
+   }
 
+    
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
diff --git a/third-party/tbb/doc/main/intro/Benefits.rst b/third-party/tbb/doc/main/intro/Benefits.rst
index b66ea5d1..5058cc71 100644
--- a/third-party/tbb/doc/main/intro/Benefits.rst
+++ b/third-party/tbb/doc/main/intro/Benefits.rst
@@ -20,7 +20,7 @@ it with any compiler supporting ISO C++. The library differs from
 typical threading packages in the following ways:
 
 
--  **oneTBB enables you to specify logical paralleism instead of
+-  **oneTBB enables you to specify logical parallelism instead of
    threads**. Most threading packages require you to specify threads.
    Programming directly in terms of threads can be tedious and lead to
    inefficient programs, because threads are low-level, heavy constructs
diff --git a/third-party/tbb/doc/main/reference/reference.rst b/third-party/tbb/doc/main/reference/reference.rst
index ec9fb1e1..833a50ee 100644
--- a/third-party/tbb/doc/main/reference/reference.rst
+++ b/third-party/tbb/doc/main/reference/reference.rst
@@ -19,6 +19,7 @@ It also describes features that are not included in the oneTBB specification.
     parallel_for_each_semantics
     parallel_sort_ranges_extension
     scalable_memory_pools/malloc_replacement_log
+    rvalue_reduce
 
 Preview features
 ****************
diff --git a/third-party/tbb/doc/main/reference/rvalue_reduce.rst b/third-party/tbb/doc/main/reference/rvalue_reduce.rst
new file mode 100644
index 00000000..53880952
--- /dev/null
+++ b/third-party/tbb/doc/main/reference/rvalue_reduce.rst
@@ -0,0 +1,89 @@
+.. _rvalue_reduce:
+
+Parallel Reduction for rvalues
+==============================
+
+.. contents::
+    :local:
+    :depth: 1
+
+Description
+***********
+
+The |full_name| implementation extends the `ParallelReduceFunc <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/named_requirements/algorithms/par_reduce_func.html>`_ and
+`ParallelReduceReduction <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/named_requirements/algorithms/par_reduce_reduction.html>`_ named requirements
+to optimize operations on ``rvalues`` when using the functional form of the ``tbb::parallel_reduce`` and ``tbb::parallel_deterministic_reduce`` algorithms.
+
+API
+***
+
+Header
+------
+
+.. code:: cpp
+
+    #include <oneapi/tbb/parallel_reduce.h>
+
+ParallelReduceFunc Requirements: Pseudo-Signature, Semantics
+------------------------------------------------------------
+
+.. cpp:function:: Value Func::operator()(const Range& range, Value&& x) const
+
+or
+
+.. cpp:function:: Value Func::operator()(const Range& range, const Value& x) const
+
+    Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/named_requirements/algorithms/range.html>`_.
+    The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/algorithms/functions/parallel_reduce_func.html>`_.
+
+    If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred.
+
+ParallelReduceReduction Requirements: Pseudo-Signature, Semantics
+-----------------------------------------------------------------
+
+.. cpp:function:: Value Reduction::operator()(Value&& x, Value&& y) const
+
+or
+
+.. cpp:function:: Value Reduction::operator()(const Value& x, const Value& y) const
+
+    Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/algorithms/functions/parallel_reduce_func.html>`_.
+
+    If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred.
+
+Example
+*******
+
+.. code:: cpp
+
+    #include <oneapi/tbb/parallel_reduce.h> // C++17
+    #include <oneapi/tbb/blocked_range.h>
+    #include <vector>
+    #include <set>
+
+    int main() {
+        std::vector<std::set<int>> sets = ...;
+
+        oneapi::tbb::parallel_reduce(oneapi::tbb::blocked_range<size_t>(0, sets.size()),
+                                     std::set<int>{}, // identity element - empty set
+                                     [&](const oneapi::tbb::blocked_range<size_t>& range, std::set<int>&& value) {
+                                         for (size_t i = range.begin(); i < range.end(); ++i) {
+                                             // Having value as a non-const rvalue reference allows to efficiently
+                                             // transfer nodes from sets[i] without copying/moving the data
+                                             value.merge(std::move(sets[i]));
+                                         }
+                                         return value;
+                                     },
+                                     [&](std::set<int>&& x, std::set<int>&& y) {
+                                         x.merge(std::move(y));
+                                         return x;
+                                     }
+                                     );
+    }
+
+.. rubric:: See also
+
+* `oneapi::tbb::parallel_reduce specification <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/algorithms/functions/parallel_reduce_func.html>`_
+* `oneapi::tbb::parallel_deterministic_reduce specification <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/algorithms/functions/parallel_deterministic_reduce_func.html>`_
+* `ParallelReduceFunc specification <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/named_requirements/algorithms/par_reduce_func.html>`_
+* `ParallelReduceReduction specification <https://spec.oneapi.io/versions/latest/elements/oneTBB/source/named_requirements/algorithms/par_reduce_reduction.html>`_
diff --git a/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst b/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst
index 8487c449..44fc2f0a 100644
--- a/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst
+++ b/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst
@@ -63,7 +63,7 @@ messages and do not support ``try_get()`` or ``try_reserve()``.
        broadcast_node<int> bn(g);
        buffer_node<int> buf1(g);
        buffer_node<int> buf2(g);
-       typedef join_node<tuple<int,int> reserving> join_type;
+       typedef join_node<tuple<int,int>, reserving> join_type;
        join_type jn(g);
        buffer_node<join_type::output_type> buf_out(g);
        join_type::output_type tuple_out;
@@ -71,9 +71,9 @@ messages and do not support ``try_get()`` or ``try_reserve()``.
 
 
        // join_node predecessors are both reservable buffer_nodes
-       make_edge(buf1,input_port<0>jn));
-       make_edge(bn,input_port<0>jn));      // attach a broadcast_node
-       make_edge(buf2,input_port<1>jn));
+       make_edge(buf1,input_port<0>(jn));
+       make_edge(bn,input_port<0>(jn));      // attach a broadcast_node
+       make_edge(buf2,input_port<1>(jn));
        make_edge(jn, buf_out);
        bn.try_put(2);
        buf1.try_put(3);
@@ -81,7 +81,7 @@ messages and do not support ``try_get()`` or ``try_reserve()``.
        buf2.try_put(7);
        g.wait_for_all();
        while (buf_out.try_get(tuple_out)) {
-           printf("join_node output == (%d,%d)\n",get<0>tuple_out), get<1>tuple_out) );
+           printf("join_node output == (%d,%d)\n",get<0>(tuple_out), get<1>(tuple_out) );
        }
        if(buf1.try_get(icnt)) printf("buf1 had %d\n", icnt);
        else printf("buf1 was empty\n");
diff --git a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst
index 57582aac..8d467fb6 100644
--- a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst
+++ b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst
@@ -46,3 +46,4 @@ TBB possible output:
     TBB: RML	private
     TBB: Tools support	disabled
 
+.. note:: The ``tbbmalloc`` library in oneTBB is fully binary compatible with TBB. 
diff --git a/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst b/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst
index cd8482ff..8d9ba3a1 100644
--- a/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst
+++ b/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst
@@ -30,14 +30,14 @@ string occurs in the array ``Data``.
 
    // Structure that defines hashing and comparison operations for user's type.
    struct MyHashCompare {
-       static size_t hash( const string& x ) {
+       size_t hash( const string& x ) const {
            size_t h = 0;
            for( const char* s = x.c_str(); *s; ++s )
                h = (h*17)^*s;
            return h;
        }
        //! True if strings are equal
-       static bool equal( const string& x, const string& y ) {
+       bool equal( const string& x, const string& y ) const {
            return x==y;
        }
    };
@@ -128,4 +128,4 @@ any other extant accesses on ``key``.
 .. toctree::
    :maxdepth: 4
 
-   ../tbb_userguide/More_on_HashCompare
\ No newline at end of file
+   ../tbb_userguide/More_on_HashCompare
diff --git a/third-party/tbb/doc/make.bat b/third-party/tbb/doc/make.bat
index 557ecc5b..14d399a5 100644
--- a/third-party/tbb/doc/make.bat
+++ b/third-party/tbb/doc/make.bat
@@ -25,7 +25,7 @@ REM Command file for Sphinx documentation
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
-set SOURCEDIR=doc
+set SOURCEDIR=.
 set BUILDDIR=build
 
 if "%1" == "" goto help
diff --git a/third-party/tbb/examples/CMakeLists.txt b/third-party/tbb/examples/CMakeLists.txt
index 979998c6..16f1c455 100644
--- a/third-party/tbb/examples/CMakeLists.txt
+++ b/third-party/tbb/examples/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(tbb_examples CXX)
 
@@ -66,6 +66,7 @@ tbb_add_example(parallel_for_each parallel_preorder)
 tbb_add_example(parallel_pipeline square)
 
 tbb_add_example(parallel_reduce convex_hull)
+tbb_add_example(parallel_reduce pi)
 tbb_add_example(parallel_reduce primes)
 
 tbb_add_example(task_arena fractal)
diff --git a/third-party/tbb/examples/README.md b/third-party/tbb/examples/README.md
index 318d2d93..037ca4d4 100644
--- a/third-party/tbb/examples/README.md
+++ b/third-party/tbb/examples/README.md
@@ -19,6 +19,7 @@ This directory contains example usages of oneAPI Threading Building Blocks.
 | parallel_for_each/parallel_preorder | Parallel preorder traversal of a graph.
 | parallel_pipeline/square | Another string transformation example that squares numbers read from a file.
 | parallel_reduce/convex_hull | Parallel version of convex hull algorithm (quick hull).
+| parallel_reduce/pi | Parallel version of calculating &pi; by numerical integration.
 | parallel_reduce/primes | Parallel version of the Sieve of Eratosthenes.
 | task_arena/fractal |The example calculates two classical Mandelbrot fractals with different concurrency limits.
 | task_group/sudoku | Compute all solutions for a Sudoku board.
diff --git a/third-party/tbb/examples/common/gui/CMakeLists.txt b/third-party/tbb/examples/common/gui/CMakeLists.txt
index 8bee0a83..ea8b0060 100644
--- a/third-party/tbb/examples/common/gui/CMakeLists.txt
+++ b/third-party/tbb/examples/common/gui/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 set(EXAMPLES_UI_MODE "con" CACHE STRING "EXAMPLES_UI_MODE")
 
diff --git a/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt b/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt
index 14d25fa7..77efd2f6 100644
--- a/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt
+++ b/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(count_strings CXX)
 
diff --git a/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp b/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp
index 2b563cd5..0a230846 100644
--- a/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp
+++ b/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ class hash<std::basic_string<CharT, Traits, Allocator>> {
         (sizeof(std::size_t) == sizeof(unsigned)) ? 2654435769U : 11400714819323198485ULL);
 
     std::hash<CharT> char_hash;
-}; // strunt hash<std::basic_string>
+}; // struct hash<std::basic_string>
 
 } // namespace std
 
diff --git a/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt b/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt
index 8a6d78a0..624a5928 100644
--- a/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt
+++ b/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(shortpath CXX)
 
diff --git a/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt b/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt
index cf4e6a1b..91792dde 100644
--- a/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt
+++ b/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(sub_string_finder_simple CXX)
 project(sub_string_finder_extended CXX)
diff --git a/third-party/tbb/examples/graph/binpack/CMakeLists.txt b/third-party/tbb/examples/graph/binpack/CMakeLists.txt
index 5fc979a5..3d3b7921 100644
--- a/third-party/tbb/examples/graph/binpack/CMakeLists.txt
+++ b/third-party/tbb/examples/graph/binpack/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(binpack CXX)
 
diff --git a/third-party/tbb/examples/graph/cholesky/CMakeLists.txt b/third-party/tbb/examples/graph/cholesky/CMakeLists.txt
index eeb2649a..2e8273ae 100644
--- a/third-party/tbb/examples/graph/cholesky/CMakeLists.txt
+++ b/third-party/tbb/examples/graph/cholesky/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(cholesky CXX)
 
diff --git a/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt b/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt
index 95f7a483..d46af59b 100644
--- a/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt
+++ b/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(dining_philosophers CXX)
 
diff --git a/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt b/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt
index a2034edb..7a9142a5 100644
--- a/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt
+++ b/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(fgbzip2 CXX)
 
diff --git a/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt b/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt
index b33f9156..99e1cc8f 100644
--- a/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt
+++ b/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(logic_sim CXX)
 
diff --git a/third-party/tbb/examples/graph/som/CMakeLists.txt b/third-party/tbb/examples/graph/som/CMakeLists.txt
index 6e759331..c2dd1a80 100644
--- a/third-party/tbb/examples/graph/som/CMakeLists.txt
+++ b/third-party/tbb/examples/graph/som/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 include(../../common/cmake/common.cmake)
 project(som CXX)
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt b/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt
index 5032da23..57e027cf 100644
--- a/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt
+++ b/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(recursive_fibonacci CXX)
 
@@ -33,7 +33,7 @@ set(EXECUTABLE "$<TARGET_FILE:recursive_fibonacci>")
 # `N` - specifies the fibonacci number which would be calculated.
 # `C` - cutoff that will be used to stop recursive split.
 # `I` - number of iteration to measure benchmark time.
-set(ARGS 30 16 20)
+set(ARGS 30 16 20 1)
 set(PERF_ARGS 50 5 20)
 
 add_execution_target(run_recursive_fibonacci recursive_fibonacci ${EXECUTABLE} "${ARGS}")
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/README.md b/third-party/tbb/examples/migration/recursive_fibonacci/README.md
index bc66c5d8..1f0341c1 100644
--- a/third-party/tbb/examples/migration/recursive_fibonacci/README.md
+++ b/third-party/tbb/examples/migration/recursive_fibonacci/README.md
@@ -9,14 +9,15 @@ cmake --build .
 
 ## Running the sample
 ### Predefined make targets
-* `make run_recursive_fibonacci` - executes the example with predefined parameters.
+* `make run_recursive_fibonacci` - executes the example with predefined parameters (extended testing enabled).
 * `make perf_run_recursive_fibonacci` - executes the example with suggested parameters to measure the oneTBB performance.
 
 ### Application parameters
 Usage:
 ```
-recursive_fibonacci N C I
+recursive_fibonacci N C I T
 ```
 * `N` - specifies the fibonacci number which would be calculated.
 * `C` - cutoff that will be used to stop recursive split.
 * `I` - number of iteration to measure benchmark time.
+* `T` - a non-zero value enables extended testing (the task is recycled in a loop).
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp
index acf22a49..e4a7c12e 100644
--- a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp
+++ b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp
@@ -22,6 +22,7 @@
 #include <utility>
 
 int cutoff;
+bool testing_enabled;
 
 template <typename F>
 std::pair</* result */ unsigned long, /* time */ unsigned long> measure(F&& f,
@@ -48,6 +49,7 @@ int main(int argc, char* argv[]) {
     int numbers = argc > 1 ? strtol(argv[1], nullptr, 0) : 50;
     cutoff = argc > 2 ? strtol(argv[2], nullptr, 0) : 16;
     unsigned long ntrial = argc > 3 ? (unsigned long)strtoul(argv[3], nullptr, 0) : 20;
+    testing_enabled = argc > 4 ? (bool)strtol(argv[4], nullptr, 0) : false;
 
     auto res = measure(fibonacci_two_tasks, numbers, ntrial);
     std::cout << "Fibonacci two tasks impl N = " << res.first << " Avg time = " << res.second
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h
index 2467f862..dae8895b 100644
--- a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h
+++ b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h
@@ -24,6 +24,7 @@
 #include <utility>
 
 extern int cutoff;
+extern bool testing_enabled;
 
 long serial_fib_1(int n) {
     return n < 2 ? n : serial_fib_1(n - 1) + serial_fib_1(n - 2);
@@ -38,39 +39,43 @@ struct single_fib_task : task_emulation::base_task {
     single_fib_task(int n, int* x) : n(n), x(x), s(state::compute)
     {}
 
-    void execute() override {
+    task_emulation::base_task* execute() override {
+        task_emulation::base_task* bypass = nullptr;
         switch (s) {
             case state::compute : {
-                compute_impl();
+                bypass = compute_impl();
                 break;
             }
             case state::sum : {
                 *x = x_l + x_r;
+
+                if (testing_enabled) {
+                    if (n == cutoff && num_recycles > 0) {
+                        --num_recycles;
+                        bypass = compute_impl();
+                    }
+                }
+
                 break;
             }
         }
+        return bypass;
     }
 
-    void compute_impl() {
+    task_emulation::base_task* compute_impl() {
+        task_emulation::base_task* bypass = nullptr;
         if (n < cutoff) {
             *x = serial_fib_1(n);
         }
         else {
-            auto bypass = this->allocate_child_and_increment<single_fib_task>(n - 2, &x_r);
+            bypass = this->allocate_child_and_increment<single_fib_task>(n - 2, &x_r);
             task_emulation::run_task(this->allocate_child_and_increment<single_fib_task>(n - 1, &x_l));
 
             // Recycling
             this->s = state::sum;
             this->recycle_as_continuation();
-
-            // Bypass is not supported by task_emulation and next_task executed directly.
-            // However, the old-TBB bypass behavior can be achieved with
-            // `return task_group::defer()` (check Migration Guide).
-            // Consider submit another task if recursion call is not acceptable
-            // i.e. instead of Direct Body call
-            // submit task_emulation::run_task(this->allocate_child_and_increment<single_fib_task>(n - 2, &x_r));
-            bypass->operator()();
         }
+        return bypass;
     }
 
 
@@ -79,6 +84,7 @@ struct single_fib_task : task_emulation::base_task {
     state s;
 
     int x_l{ 0 }, x_r{ 0 };
+    int num_recycles{5};
 };
 
 int fibonacci_single_task(int n) {
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h
index 91236625..5d7fd022 100644
--- a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h
+++ b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h
@@ -33,8 +33,9 @@ long serial_fib(int n) {
 struct fib_continuation : task_emulation::base_task {
     fib_continuation(int& s) : sum(s) {}
 
-    void execute() override {
+    task_emulation::base_task* execute() override {
         sum = x + y;
+        return nullptr;
     }
 
     int x{ 0 }, y{ 0 };
@@ -44,7 +45,8 @@ struct fib_continuation : task_emulation::base_task {
 struct fib_computation : task_emulation::base_task {
     fib_computation(int n, int* x) : n(n), x(x) {}
 
-    void execute() override {
+    task_emulation::base_task* execute() override {
+        task_emulation::base_task* bypass = nullptr;
         if (n < cutoff) {
             *x = serial_fib(n);
         }
@@ -57,15 +59,9 @@ struct fib_computation : task_emulation::base_task {
             this->recycle_as_child_of(c);
             n = n - 2;
             x = &c.y;
-
-            // Bypass is not supported by task_emulation and next_task executed directly.
-            // However, the old-TBB bypass behavior can be achieved with
-            // `return task_group::defer()` (check Migration Guide).
-            // Consider submit another task if recursion call is not acceptable
-            // i.e. instead of Recycling + Direct Body call
-            // submit task_emulation::run_task(c.create_child<fib_computation>(n - 2, &c.y));
-            this->operator()();
+            bypass = this;
         }
+        return bypass;
     }
 
     int n;
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h b/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h
index 3a387127..7252d447 100644
--- a/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h
+++ b/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h
@@ -47,32 +47,45 @@ class base_task {
 public:
     base_task() = default;
 
-    base_task(const base_task& t) : m_parent(t.m_parent), m_child_counter(t.m_child_counter.load())
+    base_task(const base_task& t) : m_type(t.m_type), m_parent(t.m_parent), m_child_counter(t.m_child_counter.load())
     {}
 
     virtual ~base_task() = default;
 
     void operator() () const {
-        base_task* parent_snapshot = m_parent;
-        const_cast<base_task*>(this)->execute();
-        if (m_parent && parent_snapshot == m_parent && m_child_counter == 0) {
-            if (m_parent->remove_reference() == 0) {
+        task_type type_snapshot = m_type;
+
+        base_task* bypass = const_cast<base_task*>(this)->execute();
+
+        if (m_parent && m_type != task_type::recycled) {
+            if (m_parent->remove_child_reference() == 0) {
                 m_parent->operator()();
-                delete m_parent;
             }
         }
 
-        if (m_child_counter == 0 && m_type == task_type::allocated) {
+        if (m_type == task_type::allocated) {
             delete this;
         }
+
+        if (bypass != nullptr) {
+            m_type = type_snapshot;
+
+            // Bypass is not supported by task_emulation, so the next task is executed directly.
+            // However, the old-TBB bypass behavior can be achieved with
+            // `return task_group::defer()` (check the Migration Guide).
+            // Consider submitting another task if a recursive call is not acceptable,
+            // i.e. instead of the direct body call below,
+            // submit the task with task_emulation::run_task().
+            bypass->operator()();
+        }
     }
 
-    virtual void execute() = 0;
+    virtual base_task* execute() = 0;
 
     template <typename C, typename... Args>
     C* allocate_continuation(std::uint64_t ref, Args&&... args) {
         C* continuation = new C{std::forward<Args>(args)...};
-        continuation->m_type = task_type::continuation;
+        continuation->m_type = task_type::allocated;
         continuation->reset_parent(reset_parent());
         continuation->m_child_counter = ref;
         return continuation;
@@ -85,7 +98,7 @@ class base_task {
 
     template <typename F, typename... Args>
     F create_child_and_increment(Args&&... args) {
-        add_reference();
+        add_child_reference();
         return create_child_impl<F>(std::forward<Args>(args)...);
     }
 
@@ -96,35 +109,36 @@ class base_task {
 
     template <typename F, typename... Args>
     F* allocate_child_and_increment(Args&&... args) {
-        add_reference();
+        add_child_reference();
         return allocate_child_impl<F>(std::forward<Args>(args)...);
     }
 
     template <typename C>
     void recycle_as_child_of(C& c) {
+        m_type = task_type::recycled;
         reset_parent(&c);
     }
 
     void recycle_as_continuation() {
-        m_type = task_type::continuation;
+        m_type = task_type::recycled;
     }
 
-    void add_reference() {
+    void add_child_reference() {
         ++m_child_counter;
     }
 
-    std::uint64_t remove_reference() {
+    std::uint64_t remove_child_reference() {
         return --m_child_counter;
     }
 
 protected:
     enum class task_type {
-        created,
+        stack_based,
         allocated,
-        continuation
+        recycled
     };
 
-    task_type m_type;
+    mutable task_type m_type;
 
 private:
     template <typename F, typename... Args>
@@ -136,7 +150,7 @@ class base_task {
     template <typename F, typename... Args>
     F create_child_impl(Args&&... args) {
         F obj{std::forward<Args>(args)...};
-        obj.m_type = task_type::created;
+        obj.m_type = task_type::stack_based;
         obj.reset_parent(this);
         return obj;
     }
@@ -162,13 +176,14 @@ class base_task {
 class root_task : public base_task {
 public:
     root_task(tbb::task_group& tg) : m_tg(tg), m_callback(m_tg.defer([] { /* Create empty callback to preserve reference for wait. */})) {
-        add_reference();
-        m_type = base_task::task_type::continuation;
+        add_child_reference();
+        m_type = base_task::task_type::allocated;
     }
 
 private:
-    void execute() override {
+    base_task* execute() override {
         m_tg.run(std::move(m_callback));
+        return nullptr;
     }
 
     tbb::task_group& m_tg;
@@ -178,7 +193,7 @@ class root_task : public base_task {
 template <typename F, typename... Args>
 F create_root_task(tbb::task_group& tg, Args&&... args) {
     F obj{std::forward<Args>(args)...};
-    obj.m_type = base_task::task_type::created;
+    obj.m_type = base_task::task_type::stack_based;
     obj.reset_parent(new root_task{tg});
     return obj;
 }
diff --git a/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt b/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt
index 47f7ca7b..59634242 100644
--- a/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(game_of_life CXX)
 
diff --git a/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt b/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt
index cb0475e2..a45aaa68 100644
--- a/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(polygon_overlay CXX)
 
diff --git a/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt b/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt
index 9236176b..61675f19 100644
--- a/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(seismic CXX)
 
diff --git a/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt b/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt
index 9dc0f83c..752fddef 100644
--- a/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(tachyon CXX)
 
@@ -39,7 +39,6 @@ add_executable(
     src/imageio.cpp
     src/imap.cpp
     src/intersect.cpp
-    src/jpeg.cpp
     src/light.cpp
     src/objbound.cpp
     src/parse.cpp
diff --git a/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp b/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp
index 30c61d7f..c1c9d762 100644
--- a/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp
+++ b/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -59,7 +59,6 @@
 #include "imageio.hpp"
 #include "ppm.hpp" /* PPM files */
 #include "tgafile.hpp" /* Truevision Targa files */
-#include "jpeg.hpp" /* JPEG files */
 
 static int fakeimage(char *name, int *xres, int *yres, unsigned char **imgdata) {
     int i, imgsize;
@@ -90,7 +89,7 @@ int readimage(rawimage *img) {
         rc = readtga(name, &xres, &yres, &imgdata);
     }
     else if (strstr(name, ".jpg")) {
-        rc = readjpeg(name, &xres, &yres, &imgdata);
+        rc = IMAGEUNSUP;
     }
     else if (strstr(name, ".gif")) {
         rc = IMAGEUNSUP;
diff --git a/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt b/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt
index 235604ab..8e98d360 100644
--- a/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(parallel_preorder CXX)
 
diff --git a/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt b/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt
index a32eaaf8..184c787e 100644
--- a/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(square CXX)
 
diff --git a/third-party/tbb/examples/parallel_reduce/README.md b/third-party/tbb/examples/parallel_reduce/README.md
index 481d8e18..0dba80ca 100644
--- a/third-party/tbb/examples/parallel_reduce/README.md
+++ b/third-party/tbb/examples/parallel_reduce/README.md
@@ -4,4 +4,5 @@ Examples using `parallel_reduce` algorithm.
 | Code sample name | Description
 |:--- |:---
 | convex_hull | Parallel version of convex hull algorithm (quick hull).
+| pi | Parallel version of calculating &pi; by numerical integration.
 | primes | Parallel version of the Sieve of Eratosthenes.
diff --git a/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt b/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt
index de32d1de..0492244a 100644
--- a/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(convex_hull_bench CXX)
 project(convex_hull_sample CXX)
diff --git a/third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt b/third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt
new file mode 100644
index 00000000..62ebe022
--- /dev/null
+++ b/third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.5)
+
+project(pi CXX)
+
+include(../../common/cmake/common.cmake)
+
+set_common_project_settings(tbb)
+
+add_executable(pi main.cpp pi.cpp)
+
+target_link_libraries(pi TBB::tbb Threads::Threads)
+target_compile_options(pi PRIVATE ${TBB_CXX_STD_FLAG})
+
+set(EXECUTABLE "$<TARGET_FILE:pi>")
+set(ARGS "")
+set(PERF_ARGS auto 100000000000)
+
+add_execution_target(run_pi pi ${EXECUTABLE} "${ARGS}")
+add_execution_target(perf_run_pi pi ${EXECUTABLE} "${PERF_ARGS}")
diff --git a/third-party/tbb/examples/parallel_reduce/pi/README.md b/third-party/tbb/examples/parallel_reduce/pi/README.md
new file mode 100644
index 00000000..be7ce0d4
--- /dev/null
+++ b/third-party/tbb/examples/parallel_reduce/pi/README.md
@@ -0,0 +1,24 @@
+# Pi Sample
+Parallel version of calculating &pi; by numerical integration.
+
+## Build
+To build the sample, run the following commands:
+```
+cmake <path_to_example>
+cmake --build .
+```
+
+## Run
+### Predefined Make Targets
+* `make run_pi` - executes the example with predefined parameters
+* `make perf_run_pi` - executes the example with suggested parameters to measure the oneTBB performance
+
+### Application Parameters
+You can use the following application parameters:
+```
+pi [n-of-threads=value] [n-of-intervals=value] [silent] [-h] [n-of-threads [n-of-intervals]]
+```
+* `-h` - prints the help for command-line options.
+* `n-of-threads` - the number of threads to use. This number is specified in the low\[:high\] range format, where both ``low`` and, optionally, ``high`` are non-negative integers. You can also use ``auto`` to let the system choose a default number of threads suitable for the platform.
+* `n-of-intervals` - the number of intervals to subdivide into. Must be a positive integer.
+* `silent` - no output except the elapsed time.
diff --git a/third-party/tbb/examples/parallel_reduce/pi/common.h b/third-party/tbb/examples/parallel_reduce/pi/common.h
new file mode 100644
index 00000000..0e316854
--- /dev/null
+++ b/third-party/tbb/examples/parallel_reduce/pi/common.h
@@ -0,0 +1,51 @@
+/*
+    Copyright (c) 2023 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef TBB_examples_pi_H
+#define TBB_examples_pi_H
+
+#include <cstdlib>
+
+typedef std::size_t number_t;
+typedef double pi_t;
+
+extern const number_t chunk_size;
+extern number_t num_intervals;
+extern pi_t step;
+
+extern bool silent;
+
+inline pi_t pi_kernel(number_t i) { // integrand 4 / (1 + x^2) at the midpoint of interval i
+    const pi_t midpoint = step * (pi_t(0.5) + pi_t(i));
+    return pi_t(4.0) / (midpoint * midpoint + pi_t(1.0));
+}
+
+inline double pi_slice_kernel(number_t slice, number_t slice_size = chunk_size) {
+    pi_t partial_sum = pi_t(0.0); // accumulated in index order over [slice, slice + slice_size)
+    for (number_t index = slice, slice_end = slice + slice_size; index < slice_end; ++index) {
+        partial_sum += pi_kernel(index);
+    }
+    return partial_sum;
+}
+
+struct threading { // scoped limit on oneTBB parallelism; members are defined in pi.cpp
+    threading(int p); // installs a tbb::global_control with max_allowed_parallelism = p
+    ~threading(); // releases that global_control
+};
+
+double compute_pi_parallel();
+
+#endif //  TBB_examples_pi_H
diff --git a/third-party/tbb/examples/parallel_reduce/pi/main.cpp b/third-party/tbb/examples/parallel_reduce/pi/main.cpp
new file mode 100644
index 00000000..81690617
--- /dev/null
+++ b/third-party/tbb/examples/parallel_reduce/pi/main.cpp
@@ -0,0 +1,100 @@
+/*
+    Copyright (c) 2023 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include "oneapi/tbb/tick_count.h"
+
+#include "common/utility/get_default_num_threads.hpp"
+#include "common/utility/utility.hpp"
+
+#include "common.h"
+
+const number_t chunk_size = 4096; // Multiple of 16, to fit float datatype to a vector register.
+
+// Number of intervals the integration range is divided into (the "n-of-intervals" CLI argument).
+number_t num_intervals = 1000000000;
+pi_t step = pi_t(0.0); // interval width; set to 1 / num_intervals at the start of each computation
+
+bool silent = false; // set by the "silent" CLI argument; suppresses the per-run result lines
+
+// Serial reference: midpoint-rule integration of 4 / (1 + x^2) over [0, 1].
+double compute_pi_serial() {
+    // pi_kernel() reads the global step, so set it before summing.
+    step = pi_t(1.0) / num_intervals;
+
+    // Split the intervals into whole chunks plus a shorter remainder.
+    const number_t remainder = num_intervals % chunk_size;
+    const number_t full_chunks_end = num_intervals - remainder;
+
+    double sum = 0;
+    for (number_t chunk_begin = 0; chunk_begin < full_chunks_end; chunk_begin += chunk_size) {
+        sum += pi_slice_kernel(chunk_begin);
+    }
+    sum += pi_slice_kernel(full_chunks_end, remainder);
+    return sum * step;
+}
+
+int main(int argc, char* argv[]) { // computes pi once per requested thread count and reports timings
+    try {
+        tbb::tick_count main_start_time = tbb::tick_count::now();
+        // A thread count of zero selects the serial version.
+        utility::thread_number_range threads(utility::get_default_num_threads, 0);
+
+        utility::parse_cli_arguments(
+            argc,
+            argv,
+            utility::cli_argument_pack()
+                // The "-h" option for displaying help is present implicitly.
+                .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc)
+                .positional_arg(num_intervals, "n-of-intervals", "number of intervals")
+                .arg(silent, "silent", "no output except time elapsed"));
+
+        for (int p = threads.first; p <= threads.last; p = threads.step(p)) {
+            pi_t pi;
+            double compute_time;
+            if (p == 0) {
+                // Run the serial version; only the computation itself is timed.
+                tbb::tick_count compute_start_time = tbb::tick_count::now();
+                pi = compute_pi_serial();
+                compute_time = (tbb::tick_count::now() - compute_start_time).seconds();
+            }
+            else {
+                // Run the parallel version with parallelism limited to p for this iteration.
+                threading tp(p);
+                tbb::tick_count compute_start_time = tbb::tick_count::now();
+                pi = compute_pi_parallel();
+                compute_time = (tbb::tick_count::now() - compute_start_time).seconds();
+            }
+
+            if (!silent) {
+                if (p == 0) {
+                    std::cout << "Serial run:\tpi = " << pi << "\tcompute time = " << compute_time
+                              << " sec\n";
+                }
+                else {
+                    std::cout << "Parallel run:\tpi = " << pi << "\tcompute time = " << compute_time
+                              << " sec\t on " << p << " threads\n";
+                }
+            }
+        }
+
+        utility::report_elapsed_time((tbb::tick_count::now() - main_start_time).seconds());
+        return 0;
+    }
+    catch (std::exception& e) {
+        std::cerr << "error occurred. error text is :\"" << e.what() << "\"\n";
+        return 1;
+    }
+}
diff --git a/third-party/tbb/examples/parallel_reduce/pi/pi.cpp b/third-party/tbb/examples/parallel_reduce/pi/pi.cpp
new file mode 100644
index 00000000..230752a9
--- /dev/null
+++ b/third-party/tbb/examples/parallel_reduce/pi/pi.cpp
@@ -0,0 +1,55 @@
+/*
+    Copyright (c) 2023 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include "common.h"
+#include "oneapi/tbb/blocked_range.h"
+#include "oneapi/tbb/global_control.h"
+#include "oneapi/tbb/parallel_reduce.h"
+
+// Body for tbb::parallel_reduce: each instance accumulates a partial sum of the integrand.
+struct reduce_body {
+    double my_pi{ 0 };
+    reduce_body() = default;
+    // Splitting constructor: the new body starts from an empty sum.
+    reduce_body(reduce_body& /*other*/, tbb::split) {}
+    void operator()(const tbb::blocked_range<number_t>& range) {
+        my_pi += pi_slice_kernel(range.begin(), range.size());
+    }
+    void join(const reduce_body& rhs) { my_pi += rhs.my_pi; }
+};
+
+// Integrates 4 / (1 + x^2) over [0, 1] in parallel using the midpoint rule.
+// Sets the global `step` to the interval width before the reduction starts,
+// because pi_kernel() reads it.
+double compute_pi_parallel() {
+    step = pi_t(1.0) / num_intervals;
+
+    const tbb::blocked_range<number_t> all_intervals(0, num_intervals);
+    reduce_body partial_sums;
+    tbb::parallel_reduce(all_intervals, partial_sums);
+
+    return partial_sums.my_pi * step;
+}
+
+static std::unique_ptr<tbb::global_control> gc; // file-scope holder for the active parallelism limit
+// Replaces any limit currently held in gc with one capping max_allowed_parallelism at p.
+threading::threading(int p) {
+    gc.reset(new tbb::global_control(tbb::global_control::max_allowed_parallelism, p));
+}
+// Destroys the global_control held in gc, if any.
+threading::~threading() {
+    gc.reset();
+}
diff --git a/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt b/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt
index dabd9682..987d4656 100644
--- a/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt
+++ b/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(primes CXX)
 
diff --git a/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt b/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt
index 888428b3..857dae64 100644
--- a/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt
+++ b/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(fractal CXX)
 
diff --git a/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt b/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt
index 5fea9ee6..f514662a 100644
--- a/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt
+++ b/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021 Intel Corporation
+# Copyright (c) 2020-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(sudoku CXX)
 
diff --git a/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt b/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt
index 5c97e28a..3b2368e0 100644
--- a/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt
+++ b/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021 Intel Corporation
+# Copyright (c) 2019-2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
 
 project(fibonacci CXX)
 
diff --git a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h
index ade91c33..40829208 100644
--- a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h
+++ b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h
@@ -921,7 +921,7 @@ class concurrent_unordered_base {
             node_allocator_traits::deallocate(dummy_node_allocator, node, 1);
         } else {
             // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes
-            #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 130000 ) && !__clang__ && !__INTEL_COMPILER
+            #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER
             volatile
             #endif
             value_node_ptr val_node = static_cast<value_node_ptr>(node);
diff --git a/third-party/tbb/include/oneapi/tbb/detail/_machine.h b/third-party/tbb/include/oneapi/tbb/detail/_machine.h
index 7a4a1e31..ca481380 100644
--- a/third-party/tbb/include/oneapi/tbb/detail/_machine.h
+++ b/third-party/tbb/include/oneapi/tbb/detail/_machine.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -96,7 +96,7 @@ static inline void machine_pause(int32_t delay) {
 #if __TBB_x86_64 || __TBB_x86_32
     while (delay-- > 0) { _mm_pause(); }
 #elif __ARM_ARCH_7A__ || __aarch64__
-    while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
+    while (delay-- > 0) { __asm__ __volatile__("isb sy" ::: "memory"); }
 #else /* Generic */
     (void)delay; // suppress without including _template_helpers.h
     yield();
diff --git a/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h b/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h
index 34bcab68..caa53fa0 100644
--- a/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h
+++ b/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2022 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -36,7 +36,15 @@
 #include "task.h" // for task::suspend_point
 
 #if _WIN32 || _WIN64
+#ifndef NOMINMAX
+#define NOMINMAX
+#define __TBB_DEFINED_NOMINMAX 1
+#endif
 #include <windows.h>
+#if __TBB_DEFINED_NOMINMAX
+#undef NOMINMAX
+#undef __TBB_DEFINED_NOMINMAX
+#endif
 #else
 #include <pthread.h>
 #endif
diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for.h b/third-party/tbb/include/oneapi/tbb/parallel_for.h
index 91c7c44c..37a26135 100644
--- a/third-party/tbb/include/oneapi/tbb/parallel_for.h
+++ b/third-party/tbb/include/oneapi/tbb/parallel_for.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -319,7 +319,7 @@ void parallel_for_impl(Index first, Index last, Index step, const Function& f, P
 template <typename Index, typename Function>
     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
 void parallel_for(Index first, Index last, Index step, const Function& f) {
-    parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
+    parallel_for_impl<Index,Function,const __TBB_DEFAULT_PARTITIONER>(first, last, step, f, __TBB_DEFAULT_PARTITIONER());
 }
 //! Parallel iteration over a range of integers with a step provided and simple partitioner
 template <typename Index, typename Function>
@@ -350,7 +350,7 @@ void parallel_for(Index first, Index last, Index step, const Function& f, affini
 template <typename Index, typename Function>
     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
 void parallel_for(Index first, Index last, const Function& f) {
-    parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
+    parallel_for_impl<Index,Function,const __TBB_DEFAULT_PARTITIONER>(first, last, static_cast<Index>(1), f, __TBB_DEFAULT_PARTITIONER());
 }
 //! Parallel iteration over a range of integers with a default step value and simple partitioner
 template <typename Index, typename Function>
@@ -395,7 +395,7 @@ void parallel_for_impl(Index first, Index last, Index step, const Function& f, P
 template <typename Index, typename Function>
     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
 void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
-    parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
+    parallel_for_impl<Index,Function,const __TBB_DEFAULT_PARTITIONER>(first, last, step, f, __TBB_DEFAULT_PARTITIONER(), context);
 }
 //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
 template <typename Index, typename Function>
@@ -426,7 +426,7 @@ void parallel_for(Index first, Index last, Index step, const Function& f, affini
 template <typename Index, typename Function>
     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
 void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
-    parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
+    parallel_for_impl<Index,Function,const __TBB_DEFAULT_PARTITIONER>(first, last, static_cast<Index>(1), f, __TBB_DEFAULT_PARTITIONER(), context);
 }
 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
 template <typename Index, typename Function>
diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h
index 56dbeb41..ab0b3453 100644
--- a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h
+++ b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h
@@ -407,6 +407,34 @@ class parallel_for_body_wrapper {
 template<typename It>
 using tag = typename std::iterator_traits<It>::iterator_category;
 
+#if __TBB_CPP20_PRESENT
+template <typename It>
+struct move_iterator_dispatch_helper { // primary template: dispatch on It itself
+    using type = It;
+};
+
+// Until C++23, std::move_iterator::iterator_concept is always defined
+// as std::input_iterator_tag, so the std::forward_iterator concept
+// always evaluates to false for it; std::move_iterator dispatch is
+// therefore made according to the base iterator type.
+template <typename It>
+struct move_iterator_dispatch_helper<std::move_iterator<It>> { // unwraps to the base iterator type
+    using type = It;
+};
+
+template <typename It>
+using iterator_tag_dispatch_impl = // random-access tag, else forward tag, else input tag, chosen via C++20 concepts
+    std::conditional_t<std::random_access_iterator<It>,
+                       std::random_access_iterator_tag,
+                       std::conditional_t<std::forward_iterator<It>,
+                                          std::forward_iterator_tag,
+                                          std::input_iterator_tag>>;
+
+template <typename It>
+using iterator_tag_dispatch = // applies iterator_tag_dispatch_impl after unwrapping std::move_iterator
+    iterator_tag_dispatch_impl<typename move_iterator_dispatch_helper<It>::type>;
+
+#else
 template<typename It>
 using iterator_tag_dispatch = typename
     std::conditional<
@@ -418,6 +446,7 @@ using iterator_tag_dispatch = typename
             std::input_iterator_tag
         >::type
     >::type;
+#endif // __TBB_CPP20_PRESENT
 
 template <typename Body, typename Iterator, typename Item>
 using feeder_is_required = tbb::detail::void_t<decltype(tbb::detail::invoke(std::declval<const Body>(),
diff --git a/third-party/tbb/include/oneapi/tbb/parallel_reduce.h b/third-party/tbb/include/oneapi/tbb/parallel_reduce.h
index 401ad004..205c97ef 100644
--- a/third-party/tbb/include/oneapi/tbb/parallel_reduce.h
+++ b/third-party/tbb/include/oneapi/tbb/parallel_reduce.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -42,16 +42,16 @@ concept parallel_reduce_body = splittable<Body> &&
 
 template <typename Function, typename Range, typename Value>
 concept parallel_reduce_function = std::invocable<const std::remove_reference_t<Function>&,
-                                                  const Range&, const Value&> &&
+                                                  const Range&, Value&&> &&
                                    std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Function>&,
-                                                                            const Range&, const Value&>,
+                                                                            const Range&, Value&&>,
                                                         Value>;
 
 template <typename Combine, typename Value>
 concept parallel_reduce_combine = std::invocable<const std::remove_reference_t<Combine>&,
-                                                 const Value&, const Value&> &&
+                                                 Value&&, Value&&> &&
                                   std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Combine>&,
-                                                                           const Value&, const Value&>,
+                                                                           Value&&, Value&&>,
                                                       Value>;
 
 } // namespace d0
@@ -390,14 +390,15 @@ class lambda_reduce_body {
         , my_value(other.my_identity_element)
     { }
     void operator()(Range& range) {
-        my_value = tbb::detail::invoke(my_real_body, range, const_cast<const Value&>(my_value));
+        my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value));
     }
+
     void join( lambda_reduce_body& rhs ) {
-        my_value = tbb::detail::invoke(my_reduction, const_cast<const Value&>(my_value),
-                                                     const_cast<const Value&>(rhs.my_value));
+        my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value));
     }
-    Value result() const {
-        return my_value;
+
+    __TBB_nodiscard Value&& result() && noexcept {
+        return std::move(my_value);
     }
 };
 
@@ -514,7 +515,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
                           ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction and simple_partitioner.
@@ -527,7 +528,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
                           ::run(range, body, partitioner );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction and auto_partitioner
@@ -540,7 +541,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
                           ::run( range, body, partitioner );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction and static_partitioner
@@ -553,7 +554,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
                                         ::run( range, body, partitioner );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction and affinity_partitioner
@@ -566,7 +567,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
                                         ::run( range, body, partitioner );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction, default partitioner and user-supplied context.
@@ -579,7 +580,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
                           ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
@@ -592,7 +593,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
                           ::run( range, body, partitioner, context );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
@@ -605,7 +606,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
                           ::run( range, body, partitioner, context );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction, static_partitioner and user-supplied context
@@ -618,7 +619,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
                                         ::run( range, body, partitioner, context );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
@@ -631,7 +632,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
                                         ::run( range, body, partitioner, context );
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with deterministic reduction and default simple partitioner.
@@ -704,7 +705,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
     start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
                           ::run(range, body, partitioner);
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with deterministic reduction and static partitioner.
@@ -716,7 +717,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
         ::run(range, body, partitioner);
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
@@ -739,7 +740,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
         ::run(range, body, partitioner, context);
-    return body.result();
+    return std::move(body).result();
 }
 
 //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
@@ -752,7 +753,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
         ::run(range, body, partitioner, context);
-    return body.result();
+    return std::move(body).result();
 }
 //@}
 
diff --git a/third-party/tbb/include/oneapi/tbb/parallel_scan.h b/third-party/tbb/include/oneapi/tbb/parallel_scan.h
index 6d2a4d64..d624f7eb 100644
--- a/third-party/tbb/include/oneapi/tbb/parallel_scan.h
+++ b/third-party/tbb/include/oneapi/tbb/parallel_scan.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -562,7 +562,7 @@ class lambda_scan_body {
 template<typename Range, typename Body>
     __TBB_requires(tbb_range<Range> && parallel_scan_body<Body, Range>)
 void parallel_scan( const Range& range, Body& body ) {
-    start_scan<Range, Body, auto_partitioner>::run(range,body,__TBB_DEFAULT_PARTITIONER());
+    start_scan<Range, Body, __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
 }
 
 //! Parallel prefix with simple_partitioner
diff --git a/third-party/tbb/include/oneapi/tbb/version.h b/third-party/tbb/include/oneapi/tbb/version.h
index db4f5f20..fff3e7e2 100644
--- a/third-party/tbb/include/oneapi/tbb/version.h
+++ b/third-party/tbb/include/oneapi/tbb/version.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -29,18 +29,22 @@
 // Product version
 #define TBB_VERSION_MAJOR 2021
 // Update version
-#define TBB_VERSION_MINOR 11
+#define TBB_VERSION_MINOR 13
 // "Patch" version for custom releases
 #define TBB_VERSION_PATCH 0
 // Suffix string
 #define __TBB_VERSION_SUFFIX ""
 // Full official version string
-#define TBB_VERSION_STRING __TBB_STRING(TBB_VERSION_MAJOR) "." __TBB_STRING(TBB_VERSION_MINOR) __TBB_VERSION_SUFFIX
+#define TBB_VERSION_STRING              \
+    __TBB_STRING(TBB_VERSION_MAJOR) "." \
+    __TBB_STRING(TBB_VERSION_MINOR) "." \
+    __TBB_STRING(TBB_VERSION_PATCH)     \
+    __TBB_VERSION_SUFFIX
 
 // OneAPI oneTBB specification version
 #define ONETBB_SPEC_VERSION "1.0"
 // Full interface version
-#define TBB_INTERFACE_VERSION 12110
+#define TBB_INTERFACE_VERSION 12130
 // Major interface version
 #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000)
 // Minor interface version
@@ -51,37 +55,37 @@
 #define __TBB_BINARY_VERSION 12
 
 //! TBB_VERSION support
-#ifndef ENDL
-#define ENDL "\n"
+#ifndef TBB_ENDL
+#define TBB_ENDL "\n"
 #endif
 
 //TBB_REVAMP_TODO: consider enabling version_string.ver generation
 //TBB_REVAMP_TODO: #include "version_string.ver"
 
-#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION ENDL
-#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING ENDL
-#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) ENDL
+#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION TBB_ENDL
+#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING TBB_ENDL
+#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) TBB_ENDL
 
 #ifndef TBB_USE_DEBUG
-    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" ENDL
+    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" TBB_ENDL
 #elif TBB_USE_DEBUG==0
-    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" ENDL
+    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" TBB_ENDL
 #elif TBB_USE_DEBUG==1
-    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" ENDL
+    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" TBB_ENDL
 #elif TBB_USE_DEBUG==2
-    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" ENDL
+    #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" TBB_ENDL
 #else
     #error Unexpected value for TBB_USE_DEBUG
 #endif
 
 #ifndef TBB_USE_ASSERT
-    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" ENDL
+    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" TBB_ENDL
 #elif TBB_USE_ASSERT==0
-    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" ENDL
+    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" TBB_ENDL
 #elif TBB_USE_ASSERT==1
-    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" ENDL
+    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" TBB_ENDL
 #elif TBB_USE_ASSERT==2
-    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" ENDL
+    #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" TBB_ENDL
 #else
     #error Unexpected value for TBB_USE_ASSERT
 #endif
diff --git a/third-party/tbb/integration/linux/modulefiles/tbb b/third-party/tbb/integration/linux/modulefiles/tbb
index ab08c352..b8c695ed 100644
--- a/third-party/tbb/integration/linux/modulefiles/tbb
+++ b/third-party/tbb/integration/linux/modulefiles/tbb
@@ -66,3 +66,4 @@ prepend-path CPATH "$tbbroot/include"
 prepend-path LIBRARY_PATH "$tbbroot/lib"
 prepend-path LD_LIBRARY_PATH "$tbbroot/lib"
 prepend-path CMAKE_PREFIX_PATH "$tbbroot"
+prepend-path PKG_CONFIG_PATH "$tbbroot/lib/pkgconfig"
diff --git a/third-party/tbb/integration/linux/modulefiles/tbb32 b/third-party/tbb/integration/linux/modulefiles/tbb32
index 9d0efc5a..db341351 100644
--- a/third-party/tbb/integration/linux/modulefiles/tbb32
+++ b/third-party/tbb/integration/linux/modulefiles/tbb32
@@ -66,3 +66,4 @@ prepend-path CPATH "$tbbroot/include32:$tbbroot/include"
 prepend-path LIBRARY_PATH "$tbbroot/lib32"
 prepend-path LD_LIBRARY_PATH "$tbbroot/lib32"
 prepend-path CMAKE_PREFIX_PATH "$tbbroot"
+prepend-path PKG_CONFIG_PATH "$tbbroot/lib32/pkgconfig"
diff --git a/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets b/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets
index ab1f244f..1c94a12c 100644
--- a/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets
+++ b/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <!--
-    Copyright (c) 2019-2021 Intel Corporation
+    Copyright (c) 2019-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
   <!-- include files -->
   <ItemDefinitionGroup>
     <ClCompile>
-      <AdditionalIncludeDirectories>$(MSBuildThisFileDirectory)..\..\lib\native\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(MSBuildThisFileDirectory)..\..\build\native\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions Condition="'$(Configuration)' == 'Debug'">TBB_USE_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
   </ItemDefinitionGroup>
@@ -27,25 +27,25 @@
   <!-- .lib files -->
   <ItemDefinitionGroup Condition="$(Configuration.ToLower().Contains('release')) AND '$(Platform)' == 'Win32'">
     <Link>
-      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\lib\native\win-x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\build\native\win-x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
       <AdditionalDependencies>tbb12.lib;tbbmalloc.lib;tbbmalloc_proxy.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="$(Configuration.ToLower().Contains('release')) AND '$(Platform)' == 'x64'">
     <Link>
-      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\lib\native\win-x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\build\native\win-x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
       <AdditionalDependencies>tbb12.lib;tbbmalloc.lib;tbbmalloc_proxy.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="$(Configuration.ToLower().Contains('debug')) AND '$(Platform)' == 'Win32'">
     <Link>
-      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\lib\native\win-x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\build\native\win-x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
       <AdditionalDependencies>tbb12_debug.lib;tbbmalloc_debug.lib;tbbmalloc_proxy_debug.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="$(Configuration.ToLower().Contains('debug')) AND '$(Platform)' == 'x64'">
     <Link>
-      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\lib\native\win-x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(MSBuildThisFileDirectory)..\..\build\native\win-x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
       <AdditionalDependencies>tbb12_debug.lib;tbbmalloc_debug.lib;tbbmalloc_proxy_debug.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
diff --git a/third-party/tbb/python/tbb/pool.py b/third-party/tbb/python/tbb/pool.py
index a372324d..dd5c8190 100644
--- a/third-party/tbb/python/tbb/pool.py
+++ b/third-party/tbb/python/tbb/pool.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016-2023 Intel Corporation
+# Copyright (c) 2016-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -89,8 +89,8 @@ class Pool(object):
 
     def __init__(self, nworkers=0, name="Pool"):
         """
-        \param nworkers (integer) number of worker threads to start
-        \param name (string) prefix for the worker threads' name
+        :param nworkers (integer) number of worker threads to start
+        :param name (string) prefix for the worker threads' name
         """
         self._closed = False
         self._tasks = task_group()
@@ -268,8 +268,8 @@ class Job:
 
     def __init__(self, func, args, kwds, apply_result):
         """
-        \param func/args/kwds used to call the function
-        \param apply_result ApplyResult object that holds the result
+        :param func/args/kwds used to call the function
+        :param apply_result ApplyResult object that holds the result
         of the function call
         """
         self._func = func
@@ -317,10 +317,10 @@ class ApplyResult(object):
 
     def __init__(self, collector=None, callback=None):
         """
-        \param collector when not None, the notify_ready() method of
+        :param collector when not None, the notify_ready() method of
         the collector will be called when the result from the Job is
         ready
-        \param callback when not None, function to call when the
+        :param callback when not None, function to call when the
         result becomes available (this is the parameter passed to the
         Pool::*_async() methods.
         """
@@ -404,7 +404,7 @@ class AbstractResultCollector(object):
 
     def __init__(self, to_notify):
         """
-        \param to_notify ApplyResult object to notify when all the
+        :param to_notify ApplyResult object to notify when all the
         results we're waiting for become available. Can be None.
         """
         self._to_notify = to_notify
@@ -414,7 +414,7 @@ def register_result(self, apply_result):
         always be called BEFORE the Jobs get submitted to the work
         queue, and BEFORE the __iter__ and _get_result() methods can
         be called
-        \param apply_result ApplyResult object to add in our collection
+        :param apply_result: ApplyResult object to add in our collection
         """
         raise NotImplementedError("Children classes must implement it")
 
@@ -422,7 +422,7 @@ def notify_ready(self, apply_result):
         """Called by the ApplyResult object (already registered via
         register_result()) that it is now ready (ie. the Job's result
         is available or an exception has been raised).
-        \param apply_result ApplyResult object telling us that the job
+        :param apply_result: ApplyResult object telling us that the job
         has been processed
         """
         raise NotImplementedError("Children classes must implement it")
@@ -431,8 +431,8 @@ def _get_result(self, idx, timeout=None):
         """Called by the CollectorIterator object to retrieve the
         result's values one after another (order defined by the
         implementation)
-        \param idx The index of the result we want, wrt collector's order
-        \param timeout integer telling how long to wait (in seconds)
+        :param idx: The index of the result we want, wrt collector's order
+        :param timeout: integer telling how long to wait (in seconds)
         for the result at index idx to be available, or None (wait
         forever)
         """
@@ -450,7 +450,7 @@ class CollectorIterator(object):
     AbstractResultCollector::__iter__() method"""
 
     def __init__(self, collector):
-        """\param AbstractResultCollector instance"""
+        """:param collector: AbstractResultCollector instance"""
         self._collector = collector
         self._idx = 0
 
@@ -486,7 +486,7 @@ class UnorderedResultCollector(AbstractResultCollector):
 
     def __init__(self, to_notify=None):
         """
-        \param to_notify ApplyResult object to notify when all the
+        :param to_notify: ApplyResult object to notify when all the
         results we're waiting for become available. Can be None.
         """
         AbstractResultCollector.__init__(self, to_notify)
@@ -499,7 +499,7 @@ def register_result(self, apply_result):
         always be called BEFORE the Jobs get submitted to the work
         queue, and BEFORE the __iter__ and _get_result() methods can
         be called
-        \param apply_result ApplyResult object to add in our collection
+        :param apply_result: ApplyResult object to add in our collection
         """
         self._expected += 1
 
@@ -507,8 +507,8 @@ def _get_result(self, idx, timeout=None):
         """Called by the CollectorIterator object to retrieve the
         result's values one after another, in the order the results have
         become available.
-        \param idx The index of the result we want, wrt collector's order
-        \param timeout integer telling how long to wait (in seconds)
+        :param idx: The index of the result we want, wrt collector's order
+        :param timeout: integer telling how long to wait (in seconds)
         for the result at index idx to be available, or None (wait
         forever)
         """
@@ -535,7 +535,7 @@ def notify_ready(self, apply_result=None):
         """Called by the ApplyResult object (already registered via
         register_result()) that it is now ready (ie. the Job's result
         is available or an exception has been raised).
-        \param apply_result ApplyResult object telling us that the job
+        :param apply_result: ApplyResult object telling us that the job
         has been processed
         """
         first_item = False
@@ -560,9 +560,9 @@ class OrderedResultCollector(AbstractResultCollector):
 
     def __init__(self, to_notify=None, as_iterator=True):
         """
-        \param to_notify ApplyResult object to notify when all the
+        :param to_notify: ApplyResult object to notify when all the
         results we're waiting for become available. Can be None.
-        \param as_iterator boolean telling whether the result value
+        :param as_iterator: boolean telling whether the result value
         set on to_notify should be an iterator (available as soon as 1
         result arrived) or a list (available only after the last
         result arrived)
@@ -578,7 +578,7 @@ def register_result(self, apply_result):
         always be called BEFORE the Jobs get submitted to the work
         queue, and BEFORE the __iter__ and _get_result() methods can
         be called
-        \param apply_result ApplyResult object to add in our collection
+        :param apply_result: ApplyResult object to add in our collection
         """
         self._results.append(apply_result)
         self._remaining += 1
@@ -587,8 +587,8 @@ def _get_result(self, idx, timeout=None):
         """Called by the CollectorIterator object to retrieve the
         result's values one after another (order defined by the
         implementation)
-        \param idx The index of the result we want, wrt collector's order
-        \param timeout integer telling how long to wait (in seconds)
+        :param idx: The index of the result we want, wrt collector's order
+        :param timeout: integer telling how long to wait (in seconds)
         for the result at index idx to be available, or None (wait
         forever)
         """
@@ -600,7 +600,7 @@ def notify_ready(self, apply_result):
         """Called by the ApplyResult object (already registered via
         register_result()) that it is now ready (ie. the Job's result
         is available or an exception has been raised).
-        \param apply_result ApplyResult object telling us that the job
+        :param apply_result: ApplyResult object telling us that the job
         has been processed
         """
         got_first = False
diff --git a/third-party/tbb/src/tbb/CMakeLists.txt b/third-party/tbb/src/tbb/CMakeLists.txt
index e3df9fc9..b996c736 100644
--- a/third-party/tbb/src/tbb/CMakeLists.txt
+++ b/third-party/tbb/src/tbb/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -126,59 +126,71 @@ target_link_libraries(tbb
     ${TBB_COMMON_LINK_LIBS}
 )
 
-tbb_install_target(tbb)
-
-if (MSVC)
-    # Create a copy of target linker file (tbb<ver>[_debug].lib) with legacy name (tbb[_debug].lib)
-    # to support previous user experience for linkage.
-    install(FILES
-            $<TARGET_LINKER_FILE:tbb>
-            DESTINATION lib
-            CONFIGURATIONS RelWithDebInfo Release MinSizeRel
-            RENAME tbb.lib
-            COMPONENT devel
-    )
-
-    install(FILES
-            $<TARGET_LINKER_FILE:tbb>
-            DESTINATION lib
-            CONFIGURATIONS Debug
-            RENAME tbb_debug.lib
-            COMPONENT devel
-    )
-endif()
-
-set(_tbb_pc_lib_name tbb)
-
-if (WIN32)
-    set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION})
-endif()
-
-if (CMAKE_SIZEOF_VOID_P EQUAL 8)
-    set(TBB_PC_NAME tbb)
-else()
-    set(TBB_PC_NAME tbb32)
+if(TBB_BUILD_APPLE_FRAMEWORKS) # Opt-in: configure the tbb target as a framework bundle (FRAMEWORK TRUE) with version and identifier metadata.
+    set_target_properties(tbb PROPERTIES
+        FRAMEWORK TRUE
+        FRAMEWORK_VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
+        XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbb
+        MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbb
+        MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
+        MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBB_BINARY_VERSION})
 endif()
 
-set(_prefix_for_pc_file "${CMAKE_INSTALL_PREFIX}")
-
-if (IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
-    set(_libdir_for_pc_file "${CMAKE_INSTALL_LIBDIR}")
-else()
-    set(_libdir_for_pc_file "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
-endif()
+tbb_install_target(tbb)
 
-if (IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
-    set(_includedir_for_pc_file "${CMAKE_INSTALL_INCLUDEDIR}")
-else()
-    set(_includedir_for_pc_file "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+if (TBB_INSTALL) # Install-only steps: legacy-named MSVC import libraries and the generated pkg-config file.
+    if (MSVC)
+        # Create a copy of target linker file (tbb<ver>[_debug].lib) with legacy name (tbb[_debug].lib)
+        # to support previous user experience for linkage.
+        install(FILES
+                $<TARGET_LINKER_FILE:tbb>
+                DESTINATION lib
+                CONFIGURATIONS RelWithDebInfo Release MinSizeRel
+                RENAME tbb.lib
+                COMPONENT devel
+        )
+
+        install(FILES
+                $<TARGET_LINKER_FILE:tbb>
+                DESTINATION lib
+                CONFIGURATIONS Debug
+                RENAME tbb_debug.lib
+                COMPONENT devel
+        )
+    endif()
+
+    set(_tbb_pc_lib_name tbb)
+
+    if (WIN32) # On Windows the name gets the binary version appended.
+        set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION})
+    endif()
+
+    if (CMAKE_SIZEOF_VOID_P EQUAL 8) # 8-byte pointers -> tbb.pc, otherwise tbb32.pc
+        set(TBB_PC_NAME tbb)
+    else()
+        set(TBB_PC_NAME tbb32)
+    endif()
+
+    set(_prefix_for_pc_file "${CMAKE_INSTALL_PREFIX}")
+
+    if (IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
+        set(_libdir_for_pc_file "${CMAKE_INSTALL_LIBDIR}")
+    else()
+        set(_libdir_for_pc_file "\${prefix}/${CMAKE_INSTALL_LIBDIR}") # \${prefix} is escaped so the literal text ${prefix} is stored
+    endif()
+
+    if (IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+        set(_includedir_for_pc_file "${CMAKE_INSTALL_INCLUDEDIR}")
+    else()
+        set(_includedir_for_pc_file "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") # same escaping as for the libdir above
+    endif()
+
+    configure_file(${PROJECT_SOURCE_DIR}/integration/pkg-config/tbb.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc @ONLY) # @ONLY: only @VAR@ references in the template are substituted
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/
+            COMPONENT devel)
 endif()
 
-configure_file(${PROJECT_SOURCE_DIR}/integration/pkg-config/tbb.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/
-        COMPONENT devel)
-
 if (COMMAND tbb_gen_vars)
     tbb_gen_vars(tbb)
 endif()
diff --git a/third-party/tbb/src/tbb/arena.cpp b/third-party/tbb/src/tbb/arena.cpp
index 41770fe5..0e7cf43c 100644
--- a/third-party/tbb/src/tbb/arena.cpp
+++ b/third-party/tbb/src/tbb/arena.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -60,7 +60,6 @@ numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_s
     if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) {
         binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core);
         __TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction");
-        binding_observer->observe(true);
     }
     return binding_observer;
 }
@@ -396,7 +395,7 @@ bool arena::is_top_priority() const {
 }
 
 bool arena::try_join() {
-    if (num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed)) {
+    if (is_joinable()) {
         my_references += arena::ref_worker;
         return true;
     }
@@ -545,7 +544,7 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) {
         .set_max_threads_per_core(ta.max_threads_per_core())
         .set_numa_id(ta.my_numa_id);
 #endif /*__TBB_ARENA_BINDING*/
-    
+
     if (ta.my_max_concurrency < 1) {
 #if __TBB_ARENA_BINDING
         ta.my_max_concurrency = (int)default_concurrency(arena_constraints);
@@ -554,6 +553,17 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) {
 #endif /*!__TBB_ARENA_BINDING*/
     }
 
+#if __TBB_CPUBIND_PRESENT
+    numa_binding_observer* observer = construct_binding_observer(
+        static_cast<d1::task_arena*>(&ta), arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots),
+        ta.my_numa_id, ta.core_type(), ta.max_threads_per_core());
+    if (observer) {
+        // TODO: Consider lazy initialization for the internal arena so that
+        // the direct calls to the observer can be omitted until actual initialization.
+        observer->on_scheduler_entry(true);
+    }
+#endif /*__TBB_CPUBIND_PRESENT*/
+
     __TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized");
     unsigned priority_level = arena_priority_level(ta.my_priority);
     threading_control* thr_control = threading_control::register_public_reference();
@@ -561,8 +571,11 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) {
 
     ta.my_arena.store(&a, std::memory_order_release);
 #if __TBB_CPUBIND_PRESENT
-    a.my_numa_binding_observer = construct_binding_observer(
-        static_cast<d1::task_arena*>(&ta), a.my_num_slots, ta.my_numa_id, ta.core_type(), ta.max_threads_per_core());
+    a.my_numa_binding_observer = observer;
+    if (observer) {
+        observer->on_scheduler_exit(true);
+        observer->observe(true);
+    }
 #endif /*__TBB_CPUBIND_PRESENT*/
 }
 
diff --git a/third-party/tbb/src/tbb/arena.h b/third-party/tbb/src/tbb/arena.h
index 61bda0bf..1e95f117 100644
--- a/third-party/tbb/src/tbb/arena.h
+++ b/third-party/tbb/src/tbb/arena.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -385,6 +385,10 @@ class arena: public padded<arena_base>
 
     bool is_top_priority() const;
 
+    bool is_joinable() const { // True while fewer workers are active than are currently allotted to this arena.
+        return num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed); // relaxed read of the allotment
+    }
+
     bool try_join();
 
     void set_allotment(unsigned allotment);
@@ -429,8 +433,7 @@ void arena::advertise_new_work() {
             workers_delta = 1;
         }
 
-        bool wakeup_workers = is_mandatory_needed || are_workers_needed;
-        request_workers(mandatory_delta, workers_delta, wakeup_workers);
+        request_workers(mandatory_delta, workers_delta, /* wakeup_threads = */ true);
     }
 }
 
diff --git a/third-party/tbb/src/tbb/concurrent_monitor.h b/third-party/tbb/src/tbb/concurrent_monitor.h
index 3d20ef5b..3e5c4beb 100644
--- a/third-party/tbb/src/tbb/concurrent_monitor.h
+++ b/third-party/tbb/src/tbb/concurrent_monitor.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -290,7 +290,17 @@ class concurrent_monitor_base {
             n = my_waitset.front();
             if (n != end) {
                 my_waitset.remove(*n);
+
+// GCC 12.x-13.x issues a warning here that to_wait_node(n)->my_is_in_list might have size 0, since n is
+// a base_node pointer. (This cannot happen, because only wait_node pointers are added to my_waitset.)
+#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
                 to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
+#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER
+#pragma GCC diagnostic pop
+#endif
             }
         }
 
diff --git a/third-party/tbb/src/tbb/dynamic_link.cpp b/third-party/tbb/src/tbb/dynamic_link.cpp
index 2d88f8bc..a21beb5a 100644
--- a/third-party/tbb/src/tbb/dynamic_link.cpp
+++ b/third-party/tbb/src/tbb/dynamic_link.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -34,7 +34,8 @@
     // Unify system calls
     #define dlopen( name, flags )   LoadLibrary( name )
     #define dlsym( handle, name )   GetProcAddress( handle, name )
-    #define dlclose( handle )       ( ! FreeLibrary( handle ) )
+    // FreeLibrary returns a BOOL result; it is deliberately discarded here.
+    #define dlclose( handle )       (void)( ! FreeLibrary( handle ) )
     #define dlerror()               GetLastError()
 #ifndef PATH_MAX
     #define PATH_MAX                MAX_PATH
diff --git a/third-party/tbb/src/tbb/global_control.cpp b/third-party/tbb/src/tbb/global_control.cpp
index 12c146c2..127fc92d 100644
--- a/third-party/tbb/src/tbb/global_control.cpp
+++ b/third-party/tbb/src/tbb/global_control.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include "oneapi/tbb/detail/_config.h"
 #include "oneapi/tbb/detail/_template_helpers.h"
 
+#include "oneapi/tbb/cache_aligned_allocator.h"
 #include "oneapi/tbb/global_control.h"
 #include "oneapi/tbb/tbb_allocator.h"
 #include "oneapi/tbb/spin_mutex.h"
@@ -49,6 +50,7 @@ class control_storage {
     std::set<d1::global_control*, control_storage_comparator, tbb_allocator<d1::global_control*>> my_list{};
     spin_mutex my_list_mutex{};
 public:
+    virtual ~control_storage() = default;
     virtual std::size_t default_value() const = 0;
     virtual void apply_active(std::size_t new_active) {
         my_active_value = new_active;
@@ -138,11 +140,22 @@ class alignas(max_nfs_size) lifetime_control : public control_storage {
     }
 };
 
-static allowed_parallelism_control allowed_parallelism_ctl;
-static stack_size_control stack_size_ctl;
-static terminate_on_exception_control terminate_on_exception_ctl;
-static lifetime_control lifetime_ctl;
-static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, &terminate_on_exception_ctl, &lifetime_ctl};
+static control_storage* controls[] = {nullptr, nullptr, nullptr, nullptr};
+
+void global_control_acquire() { // Placement-constructs each control object in cache-aligned storage and stores it in controls[].
+    controls[0] = new (cache_aligned_allocate(sizeof(allowed_parallelism_control))) allowed_parallelism_control{};
+    controls[1] = new (cache_aligned_allocate(sizeof(stack_size_control))) stack_size_control{};
+    controls[2] = new (cache_aligned_allocate(sizeof(terminate_on_exception_control))) terminate_on_exception_control{};
+    controls[3] = new (cache_aligned_allocate(sizeof(lifetime_control))) lifetime_control{};
+} // NOTE(review): slot order presumably has to match the global_control parameter indices - confirm.
+
+void global_control_release() { // Tears down every slot of controls[]; counterpart of global_control_acquire().
+    for (auto& ptr : controls) {
+        ptr->~control_storage(); // explicit destructor call, dispatched through the virtual destructor of control_storage
+        cache_aligned_deallocate(ptr); // hand the raw storage back to the cache-aligned allocator
+        ptr = nullptr; // NOTE(review): there is no null check above, so every slot must have been populated - confirm acquire/release are always paired
+    }
+}
 
 void global_control_lock() {
     for (auto& ctl : controls) {
diff --git a/third-party/tbb/src/tbb/governor.cpp b/third-party/tbb/src/tbb/governor.cpp
index 1a66f5de..55175196 100644
--- a/third-party/tbb/src/tbb/governor.cpp
+++ b/third-party/tbb/src/tbb/governor.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -42,6 +42,8 @@ namespace detail {
 namespace r1 {
 
 void clear_address_waiter_table();
+void global_control_acquire();
+void global_control_release();
 
 //! global_control.cpp contains definition
 bool remove_and_check_if_empty(d1::global_control& gc);
@@ -60,6 +62,7 @@ namespace system_topology {
 //------------------------------------------------------------------------
 
 void governor::acquire_resources () {
+    global_control_acquire();
 #if __TBB_USE_POSIX
     int status = theTLS.create(auto_terminate);
 #else
@@ -85,6 +88,7 @@ void governor::release_resources () {
 
     system_topology::destroy();
     dynamic_unlink_all();
+    global_control_release();
 }
 
 rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) {
diff --git a/third-party/tbb/src/tbb/governor.h b/third-party/tbb/src/tbb/governor.h
index 9d5e94d3..573443d7 100644
--- a/third-party/tbb/src/tbb/governor.h
+++ b/third-party/tbb/src/tbb/governor.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -138,6 +138,8 @@ class governor {
     static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; }
 #endif
 
+    static bool hybrid_cpu() { return cpu_features.hybrid; } // Reports the hybrid flag stored in cpu_features.
+
     static bool rethrow_exception_broken() { return is_rethrow_broken; }
 
     static bool is_itt_present() {
diff --git a/third-party/tbb/src/tbb/misc.cpp b/third-party/tbb/src/tbb/misc.cpp
index 17da1238..115a5f38 100644
--- a/third-party/tbb/src/tbb/misc.cpp
+++ b/third-party/tbb/src/tbb/misc.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -92,6 +92,8 @@ void PrintExtraVersionInfo( const char* category, const char* format, ... ) {
 //! check for transaction support.
 #if _MSC_VER
 #include <intrin.h> // for __cpuid
+#elif __APPLE__
+#include <sys/sysctl.h>
 #endif
 
 #if __TBB_x86_32 || __TBB_x86_64
@@ -131,13 +133,22 @@ void detect_cpu_features(cpu_features_type& cpu_features) {
 #if __TBB_x86_32 || __TBB_x86_64
     const int rtm_ebx_mask = 1 << 11;
     const int waitpkg_ecx_mask = 1 << 5;
+    const int hybrid_edx_mask = 1 << 15;
     int registers[4] = {0};
 
-    // Check RTM and WAITPKG
+    // Check RTM, WAITPKG, HYBRID
     check_cpuid(7, 0, registers);
     cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0;
     cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0;
-#endif /* (__TBB_x86_32 || __TBB_x86_64) */
+    cpu_features.hybrid = (registers[3] & hybrid_edx_mask) != 0;
+#elif __APPLE__
+    // Check HYBRID (hw.nperflevels > 1)
+    uint64_t nperflevels = 0;
+    size_t nperflevels_size = sizeof(nperflevels);
+    if (!sysctlbyname("hw.nperflevels", &nperflevels, &nperflevels_size, nullptr, 0)) {
+        cpu_features.hybrid = (nperflevels > 1);
+    }
+#endif
 }
 
 } // namespace r1
diff --git a/third-party/tbb/src/tbb/misc.h b/third-party/tbb/src/tbb/misc.h
index b11c0029..988c29b1 100644
--- a/third-party/tbb/src/tbb/misc.h
+++ b/third-party/tbb/src/tbb/misc.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2022 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -211,6 +211,7 @@ T1 atomic_update(std::atomic<T1>& dst, T1 newValue, Pred compare) {
 struct cpu_features_type {
     bool rtm_enabled{false};
     bool waitpkg_enabled{false};
+    bool hybrid{false}; // filled in by detect_cpu_features(); keeps its default (false) when no detection branch sets it
 };
 
 void detect_cpu_features(cpu_features_type& cpu_features);
diff --git a/third-party/tbb/src/tbb/scheduler_common.h b/third-party/tbb/src/tbb/scheduler_common.h
index 56610ffe..f9e8a68d 100644
--- a/third-party/tbb/src/tbb/scheduler_common.h
+++ b/third-party/tbb/src/tbb/scheduler_common.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -268,12 +268,7 @@ class stealing_loop_backoff {
     // no worse than 2x the optimal spin time. Or perhaps a time-slice quantum is the right amount.
     stealing_loop_backoff(int num_workers, int yields_multiplier)
         : my_pause_threshold{ 2 * (num_workers + 1) }
-#if __APPLE__
-        // threshold value tuned separately for macOS due to high cost of sched_yield there
-        , my_yield_threshold{10 * yields_multiplier}
-#else
         , my_yield_threshold{100 * yields_multiplier}
-#endif
         , my_pause_count{}
         , my_yield_count{}
     {}
diff --git a/third-party/tbb/src/tbb/tbb.rc b/third-party/tbb/src/tbb/tbb.rc
index 6c8b99fc..57e9d391 100644
--- a/third-party/tbb/src/tbb/tbb.rc
+++ b/third-party/tbb/src/tbb/tbb.rc
@@ -1,4 +1,4 @@
-// Copyright (c) 2005-2023 Intel Corporation
+// Copyright (c) 2005-2024 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ BEGIN
             VALUE "CompanyName", "Intel Corporation\0"
             VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0"
             VALUE "FileVersion", TBB_VERSION "\0"
-            VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation.  All Rights Reserved.\0"
+            VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation.  All Rights Reserved.\0"
             VALUE "LegalTrademarks", "\0"
 #ifndef TBB_USE_DEBUG
             VALUE "OriginalFilename", "tbb12.dll\0"
diff --git a/third-party/tbb/src/tbb/tcm_adaptor.cpp b/third-party/tbb/src/tbb/tcm_adaptor.cpp
index 3963ae13..e20ebb83 100644
--- a/third-party/tbb/src/tbb/tcm_adaptor.cpp
+++ b/third-party/tbb/src/tbb/tcm_adaptor.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2023 Intel Corporation
+    Copyright (c) 2023-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -137,11 +137,9 @@ class tcm_client : public pm_client {
             // The permit has changed during the reading, so the callback will be invoked soon one more time and
             // we can just skip this renegotiation iteration.
             if (!new_permit.flags.stale) {
-                __TBB_ASSERT(
-                    new_permit.state != TCM_PERMIT_STATE_INACTIVE || new_concurrency == 0,
-                    "TCM did not nullify resources while deactivating the permit"
-                );
-                delta = update_concurrency(new_concurrency);
+                // If there is no other demand in TCM, the permit may still have granted concurrency while
+                // being in the inactive state, so we enforce a 0 allotment to preserve arena invariants.
+                delta = update_concurrency(new_permit.state != TCM_PERMIT_STATE_INACTIVE ? new_concurrency : 0);
             }
         }
         if (delta) {
diff --git a/third-party/tbb/src/tbb/thread_dispatcher.cpp b/third-party/tbb/src/tbb/thread_dispatcher.cpp
index 8f33dc06..69a108d6 100644
--- a/third-party/tbb/src/tbb/thread_dispatcher.cpp
+++ b/third-party/tbb/src/tbb/thread_dispatcher.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -164,6 +164,18 @@ thread_dispatcher_client* thread_dispatcher::client_in_need(thread_dispatcher_cl
     return client_in_need(my_client_list, my_next_client);
 }
 
+bool thread_dispatcher::is_any_client_in_need() { // Returns true if at least one client in my_client_list reports is_joinable().
+    client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); // reader lock: the lists are only traversed here
+    for (auto& priority_list : my_client_list) {
+        for (auto& client : priority_list) {
+            if (client.is_joinable()) {
+                return true; // stop at the first joinable client
+            }
+        }
+    }
+    return false; // no client in any priority list was joinable
+}
+
 void thread_dispatcher::adjust_job_count_estimate(int delta) {
     my_server->adjust_job_count_estimate(delta);
 }
diff --git a/third-party/tbb/src/tbb/thread_dispatcher.h b/third-party/tbb/src/tbb/thread_dispatcher.h
index f11344ca..e511e2b7 100644
--- a/third-party/tbb/src/tbb/thread_dispatcher.h
+++ b/third-party/tbb/src/tbb/thread_dispatcher.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -44,6 +44,7 @@ class thread_dispatcher : no_copy, rml::tbb_client {
     thread_dispatcher_client* create_client(arena& a);
     void register_client(thread_dispatcher_client* client);
     bool try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority);
+    bool is_any_client_in_need();
 
     void adjust_job_count_estimate(int delta);
     void release(bool blocking_terminate);
diff --git a/third-party/tbb/src/tbb/thread_dispatcher_client.h b/third-party/tbb/src/tbb/thread_dispatcher_client.h
index c93ff31d..f7c199cb 100644
--- a/third-party/tbb/src/tbb/thread_dispatcher_client.h
+++ b/third-party/tbb/src/tbb/thread_dispatcher_client.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -32,6 +32,11 @@ class thread_dispatcher_client : public d1::intrusive_list_node /* Need for list
     bool try_join() {
         return my_arena.try_join();
     }
+
+    bool is_joinable() { // Forwards to my_arena.is_joinable().
+        return my_arena.is_joinable();
+    }
+
     void process(thread_data& td) {
         my_arena.process(td);
     }
diff --git a/third-party/tbb/src/tbb/thread_request_serializer.cpp b/third-party/tbb/src/tbb/thread_request_serializer.cpp
index 5973f14c..6019f732 100644
--- a/third-party/tbb/src/tbb/thread_request_serializer.cpp
+++ b/third-party/tbb/src/tbb/thread_request_serializer.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -37,8 +37,8 @@ void thread_request_serializer::update(int delta) {
     if (prev_pending_delta == pending_delta_base) {
         delta = int(my_pending_delta.exchange(pending_delta_base) & delta_mask) - int(pending_delta_base);
         mutex_type::scoped_lock lock(my_mutex);
-        my_total_request += delta;
-        delta = limit_delta(delta, my_soft_limit, my_total_request);
+        my_total_request.store(my_total_request.load(std::memory_order_relaxed) + delta, std::memory_order_relaxed);
+        delta = limit_delta(delta, my_soft_limit, my_total_request.load(std::memory_order_relaxed));
         my_thread_dispatcher.adjust_job_count_estimate(delta);
     }
 }
@@ -46,7 +46,7 @@ void thread_request_serializer::update(int delta) {
 void thread_request_serializer::set_active_num_workers(int soft_limit) {
     mutex_type::scoped_lock lock(my_mutex);
     int delta = soft_limit - my_soft_limit;
-    delta = limit_delta(delta, my_total_request, soft_limit);
+    delta = limit_delta(delta, my_total_request.load(std::memory_order_relaxed), soft_limit);
     my_thread_dispatcher.adjust_job_count_estimate(delta);
     my_soft_limit = soft_limit;
 }
@@ -109,6 +109,8 @@ void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) {
     }
 }
 
+int thread_request_serializer_proxy::num_workers_requested() { return my_serializer.num_workers_requested(); } // Forwards to the wrapped serializer.
+
 void thread_request_serializer_proxy::update(int delta) { my_serializer.update(delta); }
 
 void thread_request_serializer_proxy::enable_mandatory_concurrency(mutex_type::scoped_lock& lock) {
diff --git a/third-party/tbb/src/tbb/thread_request_serializer.h b/third-party/tbb/src/tbb/thread_request_serializer.h
index 261a46d7..9dc9799e 100644
--- a/third-party/tbb/src/tbb/thread_request_serializer.h
+++ b/third-party/tbb/src/tbb/thread_request_serializer.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -39,6 +39,7 @@ class thread_request_serializer : public thread_request_observer {
 public:
     thread_request_serializer(thread_dispatcher& td, int soft_limit);
     void set_active_num_workers(int soft_limit);
+    int num_workers_requested() { return my_total_request.load(std::memory_order_relaxed); } // relaxed read of my_total_request; takes no lock here
     bool is_no_workers_avaliable() { return my_soft_limit == 0; }
 
 private:
@@ -48,7 +49,7 @@ class thread_request_serializer : public thread_request_observer {
 
     thread_dispatcher& my_thread_dispatcher;
     int my_soft_limit{ 0 };
-    int my_total_request{ 0 };
+    std::atomic<int> my_total_request{ 0 };
     // my_pending_delta is set to pending_delta_base to have ability to hold negative values
     // consider increase base since thead number will be bigger than 1 << 15
     static constexpr std::uint64_t pending_delta_base = 1 << 15;
@@ -63,6 +64,7 @@ class thread_request_serializer_proxy : public thread_request_observer {
     thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit);
     void register_mandatory_request(int mandatory_delta);
     void set_active_num_workers(int soft_limit);
+    int num_workers_requested();
 
 private:
     void update(int delta) override;
diff --git a/third-party/tbb/src/tbb/threading_control.cpp b/third-party/tbb/src/tbb/threading_control.cpp
index f253c83d..1ca18378 100644
--- a/third-party/tbb/src/tbb/threading_control.cpp
+++ b/third-party/tbb/src/tbb/threading_control.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -164,6 +164,10 @@ void threading_control_impl::adjust_demand(threading_control_client tc_client, i
     my_permit_manager->adjust_demand(c, mandatory_delta, workers_delta);
 }
 
+bool threading_control_impl::is_any_other_client_active() { // false unless workers are requested AND the dispatcher reports a client in need
+    return my_thread_request_serializer->num_workers_requested() > 0 && my_thread_dispatcher->is_any_client_in_need();
+}
+
 thread_control_monitor& threading_control_impl::get_waiting_threads_monitor() {
     return *my_waiting_threads_monitor;
 }
@@ -389,6 +393,10 @@ void threading_control::adjust_demand(threading_control_client client, int manda
     my_pimpl->adjust_demand(client, mandatory_delta, workers_delta);
 }
 
+bool threading_control::is_any_other_client_active() { // forwards to the implementation object (my_pimpl)
+    return my_pimpl->is_any_other_client_active();
+}
+
 thread_control_monitor& threading_control::get_waiting_threads_monitor() {
     return my_pimpl->get_waiting_threads_monitor();
 }
diff --git a/third-party/tbb/src/tbb/threading_control.h b/third-party/tbb/src/tbb/threading_control.h
index 55347189..7381b297 100644
--- a/third-party/tbb/src/tbb/threading_control.h
+++ b/third-party/tbb/src/tbb/threading_control.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022-2023 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -69,6 +69,7 @@ class threading_control_impl {
     unsigned max_num_workers();
 
     void adjust_demand(threading_control_client, int mandatory_delta, int workers_delta);
+    bool is_any_other_client_active();
 
     thread_control_monitor& get_waiting_threads_monitor();
 
@@ -116,6 +117,7 @@ class threading_control {
     static unsigned max_num_workers();
 
     void adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta);
+    bool is_any_other_client_active();
 
     thread_control_monitor& get_waiting_threads_monitor();
 
diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify.h b/third-party/tbb/src/tbb/tools_api/ittnotify.h
index d15aae26..eb1571dc 100644
--- a/third-party/tbb/src/tbb/tools_api/ittnotify.h
+++ b/third-party/tbb/src/tbb/tools_api/ittnotify.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -101,6 +101,11 @@ The same ID may not be reused for different instances, unless a previous
 #  define ITT_OS_FREEBSD   4
 #endif /* ITT_OS_FREEBSD */
 
+#ifndef ITT_OS_OPENBSD
+#  define ITT_OS_OPENBSD 5
+#endif /* ITT_OS_OPENBSD */
+
+
 #ifndef ITT_OS
 #  if defined WIN32 || defined _WIN32
 #    define ITT_OS ITT_OS_WIN
@@ -108,6 +113,8 @@ The same ID may not be reused for different instances, unless a previous
 #    define ITT_OS ITT_OS_MAC
 #  elif defined( __FreeBSD__ )
 #    define ITT_OS ITT_OS_FREEBSD
+#  elif defined( __OpenBSD__ )
+#    define ITT_OS ITT_OS_OPENBSD
 #  else
 #    define ITT_OS ITT_OS_LINUX
 #  endif
@@ -129,6 +136,10 @@ The same ID may not be reused for different instances, unless a previous
 #  define ITT_PLATFORM_FREEBSD 4
 #endif /* ITT_PLATFORM_FREEBSD */
 
+#ifndef ITT_PLATFORM_OPENBSD
+#  define ITT_PLATFORM_OPENBSD 5
+#endif /* ITT_PLATFORM_OPENBSD */
+
 #ifndef ITT_PLATFORM
 #  if ITT_OS==ITT_OS_WIN
 #    define ITT_PLATFORM ITT_PLATFORM_WIN
@@ -136,6 +147,8 @@ The same ID may not be reused for different instances, unless a previous
 #    define ITT_PLATFORM ITT_PLATFORM_MAC
 #  elif ITT_OS==ITT_OS_FREEBSD
 #    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  elif ITT_OS==ITT_OS_OPENBSD
+#    define ITT_PLATFORM ITT_PLATFORM_OPENBSD
 #  else
 #    define ITT_PLATFORM ITT_PLATFORM_POSIX
 #  endif
@@ -305,7 +318,7 @@ extern "C" {
  *     only pauses tracing and analyzing memory access.
  *     It does not pause tracing or analyzing threading APIs.
  *   .
- * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ * - Intel(R) VTune(TM) Profiler:
  *   - Does continue to record when new threads are started.
  *   .
  * - Other effects:
diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify_config.h b/third-party/tbb/src/tbb/tools_api/ittnotify_config.h
index 44edfd67..001d42e0 100644
--- a/third-party/tbb/src/tbb/tools_api/ittnotify_config.h
+++ b/third-party/tbb/src/tbb/tools_api/ittnotify_config.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -34,6 +34,10 @@
 #  define ITT_OS_FREEBSD   4
 #endif /* ITT_OS_FREEBSD */
 
+#ifndef ITT_OS_OPENBSD
+#  define ITT_OS_OPENBSD   5
+#endif /* ITT_OS_OPENBSD */
+
 #ifndef ITT_OS
 #  if defined WIN32 || defined _WIN32
 #    define ITT_OS ITT_OS_WIN
@@ -41,6 +45,8 @@
 #    define ITT_OS ITT_OS_MAC
 #  elif defined( __FreeBSD__ )
 #    define ITT_OS ITT_OS_FREEBSD
+#  elif defined( __OpenBSD__ )
+#    define ITT_OS ITT_OS_OPENBSD
 #  else
 #    define ITT_OS ITT_OS_LINUX
 #  endif
@@ -62,6 +68,10 @@
 #  define ITT_PLATFORM_FREEBSD 4
 #endif /* ITT_PLATFORM_FREEBSD */
 
+#ifndef ITT_PLATFORM_OPENBSD
+#  define ITT_PLATFORM_OPENBSD 5
+#endif /* ITT_PLATFORM_OPENBSD */
+
 #ifndef ITT_PLATFORM
 #  if ITT_OS==ITT_OS_WIN
 #    define ITT_PLATFORM ITT_PLATFORM_WIN
@@ -69,6 +79,8 @@
 #    define ITT_PLATFORM ITT_PLATFORM_MAC
 #  elif ITT_OS==ITT_OS_FREEBSD
 #    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  elif ITT_OS==ITT_OS_OPENBSD
+#    define ITT_PLATFORM ITT_PLATFORM_OPENBSD
 #  else
 #    define ITT_PLATFORM ITT_PLATFORM_POSIX
 #  endif
@@ -235,7 +247,7 @@
 #define API_VERSION_BUILD    20230630
 
 #ifndef API_VERSION_NUM
-#define API_VERSION_NUM 3.24.2
+#define API_VERSION_NUM 3.24.4
 #endif /* API_VERSION_NUM */
 
 #define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
@@ -634,7 +646,7 @@ typedef struct ___itt_global
         h->nameA   = NULL; \
         h->nameW   = name ? _wcsdup(name) : NULL; \
         h->domainA   = NULL; \
-        h->domainW   = name ? _wcsdup(domain) : NULL; \
+        h->domainW   = domain ? _wcsdup(domain) : NULL; \
         h->type = type; \
         h->index = 0; \
         h->next   = NULL; \
diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify_static.c b/third-party/tbb/src/tbb/tools_api/ittnotify_static.c
index ab396d20..c3a53bf0 100644
--- a/third-party/tbb/src/tbb/tools_api/ittnotify_static.c
+++ b/third-party/tbb/src/tbb/tools_api/ittnotify_static.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -81,7 +81,7 @@ static const char api_version[] = API_VERSION "\0\n@(#) $Revision$\n";
 
 #if ITT_OS==ITT_OS_WIN
 static const char* ittnotify_lib_name = "libittnotify.dll";
-#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD
+#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD || ITT_OS==ITT_OS_OPENBSD
 static const char* ittnotify_lib_name = "libittnotify.so";
 #elif ITT_OS==ITT_OS_MAC
 static const char* ittnotify_lib_name = "libittnotify.dylib";
diff --git a/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h b/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h
index b5999c2a..837bc480 100644
--- a/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h
+++ b/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -245,7 +245,7 @@ extern "C" {
  *     only pauses tracing and analyzing memory access.
  *     It does not pause tracing or analyzing threading APIs.
  *   .
- * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ * - Intel(R) VTune(TM) Profiler:
  *   - Does continue to record when new threads are started.
  *   .
  * - Other effects:
diff --git a/third-party/tbb/src/tbb/waiters.h b/third-party/tbb/src/tbb/waiters.h
index e2aa4abc..8ed431f8 100644
--- a/third-party/tbb/src/tbb/waiters.h
+++ b/third-party/tbb/src/tbb/waiters.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -58,6 +58,24 @@ class outermost_worker_waiter : public waiter_base {
         __TBB_ASSERT(t == nullptr, nullptr);
 
         if (is_worker_should_leave(slot)) {
+            if (!governor::hybrid_cpu()) {
+                static constexpr std::chrono::microseconds worker_wait_leave_duration(1000);
+                static_assert(worker_wait_leave_duration > std::chrono::steady_clock::duration(1), "Clock resolution is not enough for measured interval.");
+
+                for (auto t1 = std::chrono::steady_clock::now(), t2 = t1;
+                    std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1) < worker_wait_leave_duration;
+                    t2 = std::chrono::steady_clock::now())
+                {
+                    if (!my_arena.is_empty() && !my_arena.is_recall_requested()) {
+                        return true;
+                    }
+
+                    if (my_arena.my_threading_control->is_any_other_client_active()) {
+                        break;
+                    }
+                    d0::yield();
+                }
+            }
             // Leave dispatch loop
             return false;
         }
@@ -114,6 +132,7 @@ class sleep_waiter : public waiter_base {
     void sleep(std::uintptr_t uniq_tag, Pred wakeup_condition) {
         my_arena.get_waiting_threads_monitor().wait<thread_control_monitor::thread_context>(wakeup_condition,
             market_context{uniq_tag, &my_arena});
+        reset_wait();
     }
 };
 
@@ -139,7 +158,6 @@ class external_waiter : public sleep_waiter {
         auto wakeup_condition = [&] { return !my_arena.is_empty() || !my_wait_ctx.continue_execution(); };
 
         sleep(std::uintptr_t(&my_wait_ctx), wakeup_condition);
-        my_backoff.reset_wait();
     }
 
     d1::wait_context* wait_ctx() {
@@ -176,11 +194,6 @@ class coroutine_waiter : public sleep_waiter {
         auto wakeup_condition = [&] { return !my_arena.is_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); };
 
         sleep(std::uintptr_t(sp), wakeup_condition);
-        my_backoff.reset_wait();
-    }
-
-    void reset_wait() {
-        my_backoff.reset_wait();
     }
 
     d1::wait_context* wait_ctx() {
diff --git a/third-party/tbb/src/tbbbind/CMakeLists.txt b/third-party/tbb/src/tbbbind/CMakeLists.txt
index 24cd3e5d..993dc8b8 100644
--- a/third-party/tbb/src/tbbbind/CMakeLists.txt
+++ b/third-party/tbb/src/tbbbind/CMakeLists.txt
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if (DEFINED CMAKE_SKIP_BUILD_RPATH)
-    set(CMAKE_SKIP_BUILD_RPATH_OLD_VALUE ${CMAKE_SKIP_BUILD_RPATH})
-endif()
 set(CMAKE_SKIP_BUILD_RPATH TRUE)
 
 function(tbbbind_build TBBBIND_NAME REQUIRED_HWLOC_TARGET)
@@ -106,10 +103,3 @@ else()
     tbbbind_build(tbbbind_2_5 HWLOC::hwloc_2_5 )
 endif()
 
-
-if (DEFINED CMAKE_SKIP_BUILD_RPATH_OLD_VALUE)
-    set(CMAKE_SKIP_BUILD_RPATH ${CMAKE_SKIP_BUILD_RPATH_OLD_VALUE})
-    unset(CMAKE_SKIP_BUILD_RPATH_OLD_VALUE)
-else()
-    unset(CMAKE_SKIP_BUILD_RPATH)
-endif()
diff --git a/third-party/tbb/src/tbbbind/tbb_bind.rc b/third-party/tbb/src/tbbbind/tbb_bind.rc
index bc060353..2d2b806e 100644
--- a/third-party/tbb/src/tbbbind/tbb_bind.rc
+++ b/third-party/tbb/src/tbbbind/tbb_bind.rc
@@ -1,4 +1,4 @@
-// Copyright (c) 2005-2023 Intel Corporation
+// Copyright (c) 2005-2024 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ BEGIN
             VALUE "CompanyName", "Intel Corporation\0"
             VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0"
             VALUE "FileVersion", TBB_VERSION "\0"
-            VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation.  All Rights Reserved.\0"
+            VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation.  All Rights Reserved.\0"
             VALUE "LegalTrademarks", "\0"
 #ifndef TBB_USE_DEBUG
             VALUE "OriginalFilename", "tbbbind.dll\0"
diff --git a/third-party/tbb/src/tbbmalloc/CMakeLists.txt b/third-party/tbb/src/tbbmalloc/CMakeLists.txt
index 0386daa3..76044fce 100644
--- a/third-party/tbb/src/tbbmalloc/CMakeLists.txt
+++ b/third-party/tbb/src/tbbmalloc/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -109,5 +109,15 @@ target_link_libraries(tbbmalloc
     ${TBB_COMMON_LINK_LIBS}
 )
 
-tbb_install_target(tbbmalloc)
+if(TBB_BUILD_APPLE_FRAMEWORKS)
+    set_target_properties(tbbmalloc PROPERTIES
+        FRAMEWORK TRUE
+        FRAMEWORK_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
+        XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbbmalloc
+        MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbbmalloc
+        MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
+        MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBBMALLOC_BINARY_VERSION}
+    )
+endif()
 
+tbb_install_target(tbbmalloc)
diff --git a/third-party/tbb/src/tbbmalloc/TypeDefinitions.h b/third-party/tbb/src/tbbmalloc/TypeDefinitions.h
index 81149166..bfadf61d 100644
--- a/third-party/tbb/src/tbbmalloc/TypeDefinitions.h
+++ b/third-party/tbb/src/tbbmalloc/TypeDefinitions.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
 #       define __ARCH_ipf 1
 #   elif defined(_M_IX86)||defined(__i386__) // the latter for MinGW support
 #       define __ARCH_x86_32 1
-#   elif defined(_M_ARM) || defined(_M_ARM64)
+#   elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) // the latter for MinGW support
 #       define __ARCH_other 1
 #   else
 #       error Unknown processor architecture for Windows
diff --git a/third-party/tbb/src/tbbmalloc/frontend.cpp b/third-party/tbb/src/tbbmalloc/frontend.cpp
index c657d804..77f9d659 100644
--- a/third-party/tbb/src/tbbmalloc/frontend.cpp
+++ b/third-party/tbb/src/tbbmalloc/frontend.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -1521,7 +1521,7 @@ bool Block::readyToShare()
     {
         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
         if ( (oldVal=publicFreeList)==nullptr )
-            (intptr_t&)(publicFreeList) = UNUSABLE;
+            publicFreeList = reinterpret_cast<FreeObject *>(UNUSABLE);
     }
 #endif
     return oldVal==nullptr;
diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc.rc b/third-party/tbb/src/tbbmalloc/tbbmalloc.rc
index 77e87ff5..2821adda 100644
--- a/third-party/tbb/src/tbbmalloc/tbbmalloc.rc
+++ b/third-party/tbb/src/tbbmalloc/tbbmalloc.rc
@@ -1,4 +1,4 @@
-// Copyright (c) 2005-2023 Intel Corporation
+// Copyright (c) 2005-2024 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ BEGIN
             VALUE "CompanyName", "Intel Corporation\0"
             VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0"
             VALUE "FileVersion", TBB_VERSION "\0"
-            VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation.  All Rights Reserved.\0"
+            VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation.  All Rights Reserved.\0"
             VALUE "LegalTrademarks", "\0"
 #ifndef TBB_USE_DEBUG
             VALUE "OriginalFilename", "tbbmalloc.dll\0"
diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h
index c81dc060..44fa47aa 100644
--- a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h
+++ b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -102,7 +102,11 @@ void suppress_unused_warning( const T& ) {}
 /*
  * Default huge page size
  */
+#if defined __loongarch64
+static const size_t HUGE_PAGE_SIZE = 32 * 1024 * 1024;
+#else
 static const size_t HUGE_PAGE_SIZE = 2 * 1024 * 1024;
+#endif
 
 /********** End of global default constants *********/
 
diff --git a/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt b/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt
index 5c23f15d..554ddc85 100644
--- a/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt
+++ b/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -90,4 +90,14 @@ target_link_libraries(tbbmalloc_proxy
     ${TBB_COMMON_LINK_LIBS}
 )
 
+if(TBB_BUILD_APPLE_FRAMEWORKS)
+    set_target_properties(tbbmalloc_proxy PROPERTIES
+        FRAMEWORK TRUE
+        FRAMEWORK_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
+        XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbbmalloc-proxy
+        MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbbmalloc-proxy
+        MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
+        MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBBMALLOC_BINARY_VERSION})
+endif()
+
 tbb_install_target(tbbmalloc_proxy)
diff --git a/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc b/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc
index 20b3b480..1884b119 100644
--- a/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc
+++ b/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc
@@ -1,4 +1,4 @@
-// Copyright (c) 2005-2023 Intel Corporation
+// Copyright (c) 2005-2024 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ BEGIN
             VALUE "CompanyName", "Intel Corporation\0"
             VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0"
             VALUE "FileVersion", TBB_VERSION "\0"
-            VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation.  All Rights Reserved.\0"
+            VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation.  All Rights Reserved.\0"
             VALUE "LegalTrademarks", "\0"
 #ifndef TBB_USE_DEBUG
             VALUE "OriginalFilename", "tbbmalloc_proxy.dll\0"
diff --git a/third-party/tbb/test/CMakeLists.txt b/third-party/tbb/test/CMakeLists.txt
index 0e0b3966..cfde681b 100644
--- a/third-party/tbb/test/CMakeLists.txt
+++ b/third-party/tbb/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Intel Corporation
+# Copyright (c) 2020-2024 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,6 +29,9 @@ function(tbb_add_test)
     add_executable(${_tbb_test_TARGET_NAME} ${_tbb_test_SUBDIR}/${_tbb_test_NAME}.cpp)
     target_include_directories(${_tbb_test_TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR})
 
+    # cmake>=3.4 no longer adds flags to export symbols from executables (CMP0065)
+    set_property(TARGET ${_tbb_test_TARGET_NAME} PROPERTY ENABLE_EXPORTS TRUE)
+
     target_compile_options(${_tbb_test_TARGET_NAME}
         PRIVATE
         ${TBB_CXX_STD_FLAG}
@@ -40,6 +43,10 @@ function(tbb_add_test)
         ${TBB_COMMON_COMPILE_FLAGS}
     )
 
+    if (TBB_BUILD_APPLE_FRAMEWORKS)
+        add_compile_definitions(TBB_USE_APPLE_FRAMEWORKS)
+    endif()
+
     if (ANDROID_PLATFORM)
         # Expand the linker rpath by the CMAKE_LIBRARY_OUTPUT_DIRECTORY path since clang compiler from Android SDK
         # doesn't respect the -L flag.
@@ -558,7 +565,7 @@ if (TARGET TBB::tbb)
         target_include_directories(test_implicit_linkage_on_windows PRIVATE
         $<TARGET_PROPERTY:TBB::tbb,INTERFACE_INCLUDE_DIRECTORIES>)
         set_target_properties(test_implicit_linkage_on_windows PROPERTIES
-        LINK_OPTIONS /LIBPATH:$<TARGET_LINKER_FILE_DIR:TBB::tbb>)
+        LINK_OPTIONS LINKER:/LIBPATH:$<TARGET_LINKER_FILE_DIR:TBB::tbb>)
         add_dependencies(test_implicit_linkage_on_windows TBB::tbb)
     endif()
 endif()
@@ -590,39 +597,39 @@ if (TARGET TBB::tbbmalloc)
         endif()
         # ----------------------------------------------------------------------------------------
         # Whitebox testing
-
-        add_executable(test_malloc_whitebox tbbmalloc/test_malloc_whitebox.cpp)
-
-        target_include_directories(test_malloc_whitebox
-            PRIVATE
-            ${CMAKE_CURRENT_SOURCE_DIR}/../include
-            ${CMAKE_CURRENT_SOURCE_DIR}/..
-            ${CMAKE_CURRENT_SOURCE_DIR})
-        target_compile_definitions(test_malloc_whitebox PRIVATE __TBBMALLOC_BUILD)
-        target_compile_options(test_malloc_whitebox
-            PRIVATE
-            ${TBB_CXX_STD_FLAG}
-            ${TBB_WARNING_SUPPRESS}
-            ${TBB_TEST_COMPILE_FLAGS}
-            ${TBB_COMMON_COMPILE_FLAGS}
-            ${TBBMALLOC_LIB_COMPILE_FLAGS}
-        )
-        if (ANDROID_PLATFORM)
-            add_test(NAME test_malloc_whitebox
-                    COMMAND ${CMAKE_COMMAND}
-                            -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
-                            -DTEST_NAME=test_malloc_whitebox
-                            -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake)
-        else()
-            add_test(NAME test_malloc_whitebox COMMAND test_malloc_whitebox --force-colors=1)
-        endif()
-        if (COMMAND target_link_options)
-            target_link_options(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS})
-        else()
-            target_link_libraries(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS})
+        if (NOT TBB_EMSCRIPTEN)
+            add_executable(test_malloc_whitebox tbbmalloc/test_malloc_whitebox.cpp)
+
+            target_include_directories(test_malloc_whitebox
+                PRIVATE
+                ${CMAKE_CURRENT_SOURCE_DIR}/../include
+                ${CMAKE_CURRENT_SOURCE_DIR}/..
+                ${CMAKE_CURRENT_SOURCE_DIR})
+            target_compile_definitions(test_malloc_whitebox PRIVATE __TBBMALLOC_BUILD)
+            target_compile_options(test_malloc_whitebox
+                PRIVATE
+                ${TBB_CXX_STD_FLAG}
+                ${TBB_WARNING_SUPPRESS}
+                ${TBB_TEST_COMPILE_FLAGS}
+                ${TBB_COMMON_COMPILE_FLAGS}
+                ${TBBMALLOC_LIB_COMPILE_FLAGS}
+            )
+            if (ANDROID_PLATFORM)
+                add_test(NAME test_malloc_whitebox
+                        COMMAND ${CMAKE_COMMAND}
+                                -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+                                -DTEST_NAME=test_malloc_whitebox
+                                -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake)
+            else()
+                add_test(NAME test_malloc_whitebox COMMAND test_malloc_whitebox --force-colors=1)
+            endif()
+            if (COMMAND target_link_options)
+                target_link_options(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS})
+            else()
+                target_link_libraries(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS})
+            endif()
+            target_link_libraries(test_malloc_whitebox PRIVATE Threads::Threads ${TBB_COMMON_LINK_LIBS})
         endif()
-        target_link_libraries(test_malloc_whitebox PRIVATE Threads::Threads ${TBB_COMMON_LINK_LIBS})
-
         # ------------------------------------------------------------------------------------------
 
         # Define TBB malloc conformance tests
diff --git a/third-party/tbb/test/common/utils_concurrency_limit.h b/third-party/tbb/test/common/utils_concurrency_limit.h
index 4b1e8d20..9d0b3c77 100644
--- a/third-party/tbb/test/common/utils_concurrency_limit.h
+++ b/third-party/tbb/test/common/utils_concurrency_limit.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2020-2022 Intel Corporation
+    Copyright (c) 2020-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -287,27 +287,44 @@ bool can_change_thread_priority() {
     return false;
 }
 
-void increase_thread_priority() {
 #if __unix__
-    pthread_t this_thread = pthread_self();
-    sched_param params;
-    params.sched_priority = sched_get_priority_max(SCHED_FIFO);
-    ASSERT(params.sched_priority != -1, nullptr);
-    int err = pthread_setschedparam(this_thread, SCHED_FIFO, &params);
-    ASSERT(err == 0, "Can not change thread priority.");
-#endif
-}
+class increased_priority_guard { // scoped guard: saves the thread's policy/params, raises it to max SCHED_FIFO priority, restores the saved values on destruction
+public:
+    increased_priority_guard() : m_backup(get_current_schedparam()) {
+        increase_thread_priority();
+    }
 
-void decrease_thread_priority() {
-#if __unix__
-    pthread_t this_thread = pthread_self();
-    sched_param params;
-    params.sched_priority = sched_get_priority_min(SCHED_FIFO);
-    ASSERT(params.sched_priority != -1, nullptr);
-    int err = pthread_setschedparam(this_thread, SCHED_FIFO, &params);
-    ASSERT(err == 0, "Can not change thread priority.");
+    ~increased_priority_guard() {
+        // restore priority on destruction
+        pthread_t this_thread = pthread_self();
+        int err = pthread_setschedparam(this_thread,
+            /*policy*/ m_backup.first, /*sched_param*/ &m_backup.second);
+        ASSERT(err == 0, nullptr);
+    }
+private:
+    std::pair<int, sched_param> get_current_schedparam() {
+        pthread_t this_thread = pthread_self();
+        sched_param params;
+        int policy = 0;
+        int err = pthread_getschedparam(this_thread, &policy, &params);
+        ASSERT(err == 0, nullptr);
+        return std::make_pair(policy, params);
+    }
+
+    void increase_thread_priority() {
+        pthread_t this_thread = pthread_self();
+        sched_param params;
+        params.sched_priority = sched_get_priority_max(SCHED_FIFO);
+        ASSERT(params.sched_priority != -1, nullptr);
+        int err = pthread_setschedparam(this_thread, SCHED_FIFO, &params);
+        ASSERT(err == 0, "Can not change thread priority.");
+    }
+
+    std::pair<int, sched_param> m_backup;
+};
+#else
+    class increased_priority_guard{};
 #endif
-}
 
 } // namespace utils
 
diff --git a/third-party/tbb/test/common/utils_dynamic_libs.h b/third-party/tbb/test/common/utils_dynamic_libs.h
index c84beac7..5e5365fc 100644
--- a/third-party/tbb/test/common/utils_dynamic_libs.h
+++ b/third-party/tbb/test/common/utils_dynamic_libs.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -46,9 +46,17 @@ namespace utils {
 #endif
 #define EXT ".dll"
 #else
+#if TBB_USE_APPLE_FRAMEWORKS
+#define PREFIX // When built as Apple* Framework, the binary has no lib prefix
+#else
 #define PREFIX "lib"
+#endif
 #if __APPLE__
+#if TBB_USE_APPLE_FRAMEWORKS
+#define EXT // When built as Apple* Framework, the binary has no extension
+#else
 #define EXT ".dylib"
+#endif
 // Android SDK build system does not support .so file name versioning
 #elif __FreeBSD__ || __NetBSD__ || __sun || _AIX || __ANDROID__
 #define EXT ".so"
@@ -58,10 +66,15 @@ namespace utils {
 #error Unknown OS
 #endif
 #endif
+#if TBB_USE_APPLE_FRAMEWORKS
+#define MALLOCFRAMEWORK "tbbmalloc.framework/"
+#else
+#define MALLOCFRAMEWORK
+#endif
 
 // Form the names of the TBB memory allocator binaries.
-#define MALLOCLIB_NAME1 PREFIX "tbbmalloc" SUFFIX1 EXT
-#define MALLOCLIB_NAME2 PREFIX "tbbmalloc" SUFFIX2 EXT
+#define MALLOCLIB_NAME1 MALLOCFRAMEWORK PREFIX "tbbmalloc" SUFFIX1 EXT
+#define MALLOCLIB_NAME2 MALLOCFRAMEWORK PREFIX "tbbmalloc" SUFFIX2 EXT
 
 #if _WIN32 || _WIN64
 using LIBRARY_HANDLE = HMODULE;
diff --git a/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp b/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp
index de54169c..52faac52 100644
--- a/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp
+++ b/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2017-2021 Intel Corporation
+    Copyright (c) 2017-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -245,6 +245,7 @@ TEST_CASE("Serial test") {
     SerialTest<N>();
 }
 
+#if !EMSCRIPTEN
 //! Testing blocked_rangeNd interface with parallel_for
 //! \brief \ref requirement
 TEST_CASE("Parallel test") {
@@ -253,6 +254,7 @@ TEST_CASE("Parallel test") {
         ParallelTest<N>();
     }
 }
+#endif
 
 //! Testing blocked_rangeNd with proportional splitting
 //! \brief \ref interface \ref requirement
diff --git a/third-party/tbb/test/conformance/conformance_parallel_for.cpp b/third-party/tbb/test/conformance/conformance_parallel_for.cpp
index 44903f06..463ea526 100644
--- a/third-party/tbb/test/conformance/conformance_parallel_for.cpp
+++ b/third-party/tbb/test/conformance/conformance_parallel_for.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -399,7 +399,9 @@ TEST_CASE("Flog test") {
     Flog<parallel_tag, 10>();
     Flog<parallel_tag, 100>();
     Flog<parallel_tag, 1000>();
+#if !EMSCRIPTEN    
     Flog<parallel_tag, 10000>();
+#endif
 }
 
 //! Testing parallel for with different types and step
diff --git a/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp b/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp
index ad8ee672..e36a2803 100644
--- a/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp
+++ b/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp
@@ -102,10 +102,8 @@ class ForEachInvokeItem {
     void do_action_and_feed(oneapi::tbb::feeder<ForEachInvokeItem>& feeder) const {
         CHECK_MESSAGE(change_vector.size() % 2 == 0, "incorrect test setup");
         std::size_t shift = change_vector.size() / 2;
-        std::cout << "Process " << real_value << std::endl;
         ++change_vector[real_value];
         if (real_value < shift) {
-            std::cout << "Add " << real_value + shift << std::endl;
             feeder.add(ForEachInvokeItem(real_value + shift, change_vector));
         }
     }
diff --git a/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp b/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp
index cf3aee9b..0214bfd9 100644
--- a/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp
+++ b/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include "common/test_invoke.h"
 
 #include "../tbb/test_partitioner.h"
+#include <list>
 
 //! \file conformance_parallel_reduce.cpp
 //! \brief Test for [algorithms.parallel_reduce algorithms.parallel_deterministic_reduce] specification
@@ -56,6 +57,59 @@ struct ReduceBody {
     }
 };
 
+template <typename T>
+class MoveOnlyWrapper { // Holds a T by value; the wrapper itself can be moved but not copied
+public:
+    MoveOnlyWrapper() = default;
+    MoveOnlyWrapper(const T& obj) : my_obj(obj) {} // Non-explicit: copies obj into the wrapper
+
+    MoveOnlyWrapper(MoveOnlyWrapper&&) = default; // Move construction and move assignment are allowed...
+    MoveOnlyWrapper& operator=(MoveOnlyWrapper&&) = default;
+
+    MoveOnlyWrapper(const MoveOnlyWrapper&) = delete; // ...while any copy is rejected at compile time
+    MoveOnlyWrapper& operator=(const MoveOnlyWrapper&) = delete;
+
+    bool operator==(const MoveOnlyWrapper& other) const { return my_obj == other.my_obj; } // Compares the wrapped values
+private:
+    T my_obj;
+}; // class MoveOnlyWrapper
+
+// A container wrapper that is copyable, but whose copy operations fail the test if the source container is non-empty.
+// If an empty container of this type is passed as the identity to a parallel reduce algorithm with an rvalue-friendly body,
+// the algorithm should call the copy constructor only while broadcasting the identity element to the leaves,
+// so every copy observed in the test below must be a copy of an empty container.
+template <typename T>
+class EmptyCopyList { // std::list<T> wrapper whose copy operations assert that the source is empty
+public:
+    EmptyCopyList() = default;
+
+    EmptyCopyList(EmptyCopyList&&) = default; // Moves transfer the underlying list without any check
+    EmptyCopyList& operator=(EmptyCopyList&&) = default;
+
+    EmptyCopyList(const EmptyCopyList& other) { // Copies no elements: my_list stays default-constructed
+        REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
+    }
+    EmptyCopyList& operator=(const EmptyCopyList& other) { // Leaves my_list untouched; only checks the source
+        REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
+        return *this;
+    }
+
+    typename std::list<T>::iterator insert(typename std::list<T>::const_iterator pos, T&& item) { // Move-inserts item before pos
+        return my_list.insert(pos, std::move(item));
+    }
+
+    void splice(typename std::list<T>::const_iterator pos, EmptyCopyList&& other) { // Transfers other's elements before pos
+        my_list.splice(pos, std::move(other.my_list));
+    }
+
+    typename std::list<T>::const_iterator end() const { return my_list.end(); }
+
+    bool operator==(const EmptyCopyList& other) const { return my_list == other.my_list; } // Element-wise list comparison
+
+private:
+    std::list<T> my_list;
+}; // class EmptyCopyList
+
 template <class Partitioner>
 void TestDeterministicReductionFor() {
     const int N = 1000;
@@ -174,3 +228,109 @@ TEST_CASE("parallel_[deterministic_]reduce and std::invoke") {
 }
 
 #endif
+
+template <typename Runner, typename... PartitionerContext> // args are forwarded to runner after the two bodies
+void test_vector_of_lists_rvalue_reduce_basic(const Runner& runner, PartitionerContext&&... args) {
+    constexpr std::size_t n_vectors = 10000; // number of lists stored in vector_of_lists
+
+    using inner_type = MoveOnlyWrapper<int>;
+    using list_type = EmptyCopyList<inner_type>;
+    using vector_of_lists_type = std::vector<list_type>;
+
+    vector_of_lists_type vector_of_lists;
+
+    vector_of_lists.reserve(n_vectors);
+    for (std::size_t i = 0; i < n_vectors; ++i) { // every input list holds the values 1..5 in order
+        list_type list;
+
+        list.insert(list.end(), inner_type{1});
+        list.insert(list.end(), inner_type{2});
+        list.insert(list.end(), inner_type{3});
+        list.insert(list.end(), inner_type{4});
+        list.insert(list.end(), inner_type{5});
+        vector_of_lists.emplace_back(std::move(list)); // moved in, so the copy check is not triggered here
+    }
+
+    oneapi::tbb::blocked_range<std::size_t> range(0, n_vectors, n_vectors * 2); // third argument (grainsize) is twice the range size
+
+    auto reduce_body = [&](const decltype(range)& range_obj, list_type&& x) { // takes the accumulator by rvalue reference
+        list_type new_list = std::move(x);
+
+        for (std::size_t index = range_obj.begin(); index != range_obj.end(); ++index) {
+            new_list.splice(new_list.end(), std::move(vector_of_lists[index])); // appends the input list; its elements are transferred, not copied
+        }
+        return new_list;
+    };
+
+    auto join_body = [&](list_type&& x, list_type&& y) { // result is x's elements followed by y's
+        list_type new_list = std::move(x);
+
+        new_list.splice(new_list.end(), std::move(y));
+        return new_list;
+    };
+
+    list_type result = runner(range, list_type{}, reduce_body, join_body, std::forward<PartitionerContext>(args)...); // identity is an empty list
+
+    list_type expected_result;
+
+    for (std::size_t i = 0; i < n_vectors; ++i) { // expected: the sequence 1..5 repeated n_vectors times
+        expected_result.insert(expected_result.end(), inner_type{1});
+        expected_result.insert(expected_result.end(), inner_type{2});
+        expected_result.insert(expected_result.end(), inner_type{3});
+        expected_result.insert(expected_result.end(), inner_type{4});
+        expected_result.insert(expected_result.end(), inner_type{5});
+    }
+
+    REQUIRE_MESSAGE(expected_result == result, "Incorrect reduce result");
+}
+
+struct ReduceRunner { // Function object that perfectly forwards its arguments to oneapi::tbb::parallel_reduce
+    template <typename... Args>
+    auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_reduce(std::forward<Args>(args)...)) {
+        return oneapi::tbb::parallel_reduce(std::forward<Args>(args)...); // returns whatever parallel_reduce returns
+    }
+};
+
+struct DeterministicReduceRunner { // Function object that perfectly forwards its arguments to oneapi::tbb::parallel_deterministic_reduce
+    template <typename... Args>
+    auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...)) {
+        return oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...); // returns whatever the algorithm returns
+    }
+};
+
+void test_vector_of_lists_rvalue_reduce() { // Runs the rvalue-reduce check through parallel_reduce for each argument combination below
+    ReduceRunner runner;
+    oneapi::tbb::affinity_partitioner af_partitioner; // passed as a named object below, unlike the other partitioners
+    oneapi::tbb::task_group_context context;
+
+    test_vector_of_lists_rvalue_reduce_basic(runner); // no partitioner, no context
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{});
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});
+    test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner);
+
+    test_vector_of_lists_rvalue_reduce_basic(runner, context); // same combinations, now with an explicit task_group_context
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{}, context);
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
+    test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner, context);
+}
+
+void test_vector_of_lists_rvalue_deterministic_reduce() { // Same check, run through parallel_deterministic_reduce
+    DeterministicReduceRunner runner;
+    oneapi::tbb::task_group_context context;
+
+    test_vector_of_lists_rvalue_reduce_basic(runner); // no partitioner, no context
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}); // only simple and static partitioners are exercised here
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});
+
+    test_vector_of_lists_rvalue_reduce_basic(runner, context); // same combinations, now with an explicit task_group_context
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
+    test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
+}
+
+//! \brief \ref interface \ref requirement
+TEST_CASE("test rvalue optimization") { // Runs the rvalue-reduce checks for both reduce algorithms
+    test_vector_of_lists_rvalue_reduce(); // parallel_reduce
+    test_vector_of_lists_rvalue_deterministic_reduce(); // parallel_deterministic_reduce
+}
diff --git a/third-party/tbb/test/tbb/test_collaborative_call_once.cpp b/third-party/tbb/test/tbb/test_collaborative_call_once.cpp
index d8ee09fd..11a04a10 100644
--- a/third-party/tbb/test/tbb/test_collaborative_call_once.cpp
+++ b/third-party/tbb/test/tbb/test_collaborative_call_once.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2022 Intel Corporation
+    Copyright (c) 2022-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -206,6 +206,7 @@ TEST_CASE("only calls once - move only argument") {
     }
 }
 
+#if !EMSCRIPTEN
 //! Stress test for functor to be called only once
 //! \brief \ref interface \ref requirement \ref stress
 TEST_CASE("only calls once - stress test") {
@@ -246,7 +247,7 @@ TEST_CASE("only calls once - stress test") {
         });
     }
 }
-
+#endif
 #if TBB_USE_EXCEPTIONS
 
 //! Test for collaborative_call_once exception handling
@@ -324,6 +325,7 @@ TEST_CASE("handles exceptions - stress test") {
 
 #endif
 
+#if !EMSCRIPTEN
 //! Test for multiple help from moonlighting threads
 //! \brief \ref interface \ref requirement
 TEST_CASE("multiple help") {
@@ -341,6 +343,7 @@ TEST_CASE("multiple help") {
         });
     });
 }
+#endif
 
 //! Test for collaborative work from different arenas
 //! \brief \ref interface \ref requirement
diff --git a/third-party/tbb/test/tbb/test_eh_algorithms.cpp b/third-party/tbb/test/tbb/test_eh_algorithms.cpp
index 75c0381d..7a2b59b4 100644
--- a/third-party/tbb/test/tbb/test_eh_algorithms.cpp
+++ b/third-party/tbb/test/tbb/test_eh_algorithms.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -401,7 +401,7 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #0") {
         }
     }
 }
-
+#if !EMSCRIPTEN
 //! Testing parallel_for and parallel_reduce exception handling
 //! \brief \ref error_guessing
 TEST_CASE("parallel_for and parallel_reduce exception handling test #1") {
@@ -486,8 +486,8 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #4") {
     }
 }
 
+#endif
 #endif /* TBB_USE_EXCEPTIONS */
-
 class ParForBodyToCancel {
 public:
     void operator()( const range_type& ) const {
@@ -698,6 +698,7 @@ TEST_CASE("parallel_for and parallel_reduce cancellation test #1") {
     }
 }
 
+#if !EMSCRIPTEN
 //! Testing parallel_for and parallel_reduce cancellation
 //! \brief \ref error_guessing
 TEST_CASE("parallel_for and parallel_reduce cancellation test #2") {
@@ -718,6 +719,7 @@ TEST_CASE("parallel_for and parallel_reduce cancellation test #2") {
         }
     }
 }
+#endif
 
 //! Testing parallel_for and parallel_reduce cancellation
 //! \brief \ref error_guessing
@@ -1033,6 +1035,7 @@ void Test5_parallel_for_each () {
     }
 } // void Test5_parallel_for_each ()
 
+#if !EMSCRIPTEN
 //! Testing parallel_for_each exception handling
 //! \brief \ref error_guessing
 TEST_CASE("parallel_for_each exception handling test #1") {
@@ -1053,6 +1056,7 @@ TEST_CASE("parallel_for_each exception handling test #1") {
         }
     }
 }
+#endif
 
 //! Testing parallel_for_each exception handling
 //! \brief \ref error_guessing
@@ -1075,6 +1079,7 @@ TEST_CASE("parallel_for_each exception handling test #2") {
     }
 }
 
+#if !EMSCRIPTEN
 //! Testing parallel_for_each exception handling
 //! \brief \ref error_guessing
 TEST_CASE("parallel_for_each exception handling test #3") {
@@ -1095,6 +1100,7 @@ TEST_CASE("parallel_for_each exception handling test #3") {
         }
     }
 }
+#endif
 
 //! Testing parallel_for_each exception handling
 //! \brief \ref error_guessing
@@ -1117,6 +1123,7 @@ TEST_CASE("parallel_for_each exception handling test #4") {
     }
 }
 
+#if !EMSCRIPTEN
 //! Testing parallel_for_each exception handling
 //! \brief \ref error_guessing
 TEST_CASE("parallel_for_each exception handling test #5") {
@@ -1139,7 +1146,7 @@ TEST_CASE("parallel_for_each exception handling test #5") {
         }
     }
 }
-
+#endif
 #endif /* TBB_USE_EXCEPTIONS */
 
 class ParForEachBodyToCancel {
@@ -1217,6 +1224,7 @@ void TestCancelation2_parallel_for_each () {
     RunCancellationTest<ParForEachWorker<body_to_cancel, Iterator>, Cancellator2>();
 }
 
+#if !EMSCRIPTEN
 //! Testing parallel_for_each cancellation test
 //! \brief \ref error_guessing
 TEST_CASE("parallel_for_each cancellation test #1") {
@@ -1257,6 +1265,7 @@ TEST_CASE("parallel_for_each cancellation test #2") {
         }
     }
 }
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Tests for tbb::parallel_pipeline
@@ -1608,6 +1617,7 @@ void TestWithDifferentFiltersAndConcurrency() {
 #endif
 }
 
+#if !EMSCRIPTEN
 //! Testing parallel_pipeline exception handling
 //! \brief \ref error_guessing
 TEST_CASE("parallel_pipeline exception handling test #1") {
@@ -1631,7 +1641,7 @@ TEST_CASE("parallel_pipeline exception handling test #3") {
 TEST_CASE("parallel_pipeline exception handling test #4") {
     TestWithDifferentFiltersAndConcurrency<Test4_pipeline>();
 }
-
+#endif
 #endif /* TBB_USE_EXCEPTIONS */
 
 class FilterToCancel  {
@@ -1727,6 +1737,7 @@ TEST_CASE("parallel_pipeline cancellation test #1") {
     }
 }
 
+#if !EMSCRIPTEN
 //! Testing parallel_pipeline cancellation
 //! \brief \ref error_guessing
 TEST_CASE("parallel_pipeline cancellation test #2") {
@@ -1748,3 +1759,4 @@ TEST_CASE("parallel_pipeline cancellation test #2") {
         }
     }
 }
+#endif
diff --git a/third-party/tbb/test/tbb/test_eh_flow_graph.cpp b/third-party/tbb/test/tbb/test_eh_flow_graph.cpp
index 71f38156..015d196e 100644
--- a/third-party/tbb/test/tbb/test_eh_flow_graph.cpp
+++ b/third-party/tbb/test/tbb/test_eh_flow_graph.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -2017,6 +2017,7 @@ void TestOneThreadNum(int nThread) {
     );
 }
 
+#if !EMSCRIPTEN
 //! Test exceptions with parallelism
 //! \brief \ref error_guessing
 TEST_CASE("Testing several threads"){
@@ -2026,5 +2027,5 @@ TEST_CASE("Testing several threads"){
         TestOneThreadNum(nThread);
     }
 }
-
+#endif
 #endif // TBB_USE_EXCEPTIONS
diff --git a/third-party/tbb/test/tbb/test_eh_thread.cpp b/third-party/tbb/test/tbb/test_eh_thread.cpp
index d5af9db6..a5ac1c8a 100644
--- a/third-party/tbb/test/tbb/test_eh_thread.cpp
+++ b/third-party/tbb/test/tbb/test_eh_thread.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2020-2022 Intel Corporation
+    Copyright (c) 2020-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ class Thread {
         mValid = false;
         pthread_attr_t attr;
         // Limit the stack size not to consume all virtual memory on 32 bit platforms.
-        std::size_t stacksize = utils::max(128*1024, PTHREAD_STACK_MIN);
+        std::size_t stacksize = utils::max(std::size_t(128*1024), std::size_t(PTHREAD_STACK_MIN));
         if (pthread_attr_init(&attr) == 0 && pthread_attr_setstacksize(&attr, stacksize) == 0) {
             mValid = pthread_create(&mHandle, &attr, thread_routine, /* arg = */ nullptr) == 0;
         }
diff --git a/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp b/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp
index 5c798063..483daadb 100644
--- a/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp
+++ b/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2018-2021 Intel Corporation
+    Copyright (c) 2018-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -842,6 +842,7 @@ TEST_CASE("Priority nodes take precedence"){
     }
 }
 
+#if !EMSCRIPTEN
 //! Test thread eager reaction
 //! \brief \ref error_guessing
 TEST_CASE("Thread eager reaction"){
@@ -849,6 +850,7 @@ TEST_CASE("Thread eager reaction"){
         ThreadsEagerReaction::test( static_cast<int>(p) );
     }
 }
+#endif
 
 //! Test prioritization under concurrency limits
 //! \brief \ref error_guessing
@@ -888,3 +890,4 @@ TEST_CASE("Exceptions") {
     Exceptions::test();
 }
 #endif
+
diff --git a/third-party/tbb/test/tbb/test_fuzzing.cpp b/third-party/tbb/test/tbb/test_fuzzing.cpp
index 6571ae0d..38cd7f8a 100644
--- a/third-party/tbb/test/tbb/test_fuzzing.cpp
+++ b/third-party/tbb/test/tbb/test_fuzzing.cpp
@@ -15,7 +15,7 @@
 */
 
 //! \file test_fuzzing.cpp
-//! \brief Test the [fuzzing] of environment variables
+//! \brief Test the [internal] of environment variables
 
 #include <fuzzer/FuzzedDataProvider.h>
 
diff --git a/third-party/tbb/test/tbb/test_global_control.cpp b/third-party/tbb/test/tbb/test_global_control.cpp
index 0c3df3bf..fddbbaf6 100644
--- a/third-party/tbb/test/tbb/test_global_control.cpp
+++ b/third-party/tbb/test/tbb/test_global_control.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -245,11 +245,13 @@ TEST_CASE("prolong lifetime advanced") {
 }
 #endif
 
+#if !EMSCRIPTEN
 //! Testing multiple wait
 //! \brief \ref error_guessing
 TEST_CASE("prolong lifetime multiple wait") {
     TestBlockingTerminateNS::TestMultpleWait();
 }
+#endif
 
 //! \brief \ref regression
 TEST_CASE("test concurrent task_scheduler_handle destruction") {
diff --git a/third-party/tbb/test/tbb/test_mutex.cpp b/third-party/tbb/test/tbb/test_mutex.cpp
index bc7b79e3..5b78f173 100644
--- a/third-party/tbb/test/tbb/test_mutex.cpp
+++ b/third-party/tbb/test/tbb/test_mutex.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -109,7 +109,6 @@ void TestTransaction(const char* name)
     REQUIRE_MESSAGE(n_transactions_attempted.load(std::memory_order_relaxed), "ERROR for " << name << ": transactions were never attempted");
 }
 
-
 //! \brief \ref error_guessing
 TEST_CASE("Transaction test") {
     if (have_TSX()) {
@@ -119,6 +118,7 @@ TEST_CASE("Transaction test") {
 }
 #endif /* __TBB_TSX_TESTING_ENABLED_FOR_THIS_COMPILER */
 
+
 //! \brief \ref error_guessing
 TEST_CASE("test upgrade/downgrade with spin_rw_mutex") {
     test_rwm_upgrade_downgrade<tbb::spin_rw_mutex>();
@@ -144,10 +144,12 @@ TEST_CASE("test spin_mutex with native threads") {
     test_with_native_threads::test<tbb::spin_mutex>();
 }
 
+#if !EMSCRIPTEN
 //! \brief \ref error_guessing
 TEST_CASE("test queuing_mutex with native threads") {
     test_with_native_threads::test<tbb::queuing_mutex>();
 }
+#endif
 
 //! \brief \ref error_guessing
 TEST_CASE("test mutex with native threads") {
@@ -160,11 +162,13 @@ TEST_CASE("test spin_rw_mutex with native threads") {
     test_with_native_threads::test_rw<tbb::spin_rw_mutex>();
 }
 
+#if !EMSCRIPTEN
 //! \brief \ref error_guessing
 TEST_CASE("test queuing_rw_mutex with native threads") {
     test_with_native_threads::test<tbb::queuing_rw_mutex>();
     test_with_native_threads::test_rw<tbb::queuing_rw_mutex>();
 }
+#endif
 
 //! \brief \ref error_guessing
 TEST_CASE("test rw_mutex with native threads") {
@@ -197,3 +201,4 @@ TEST_CASE("internal mutex concepts") {
                              tbb::null_rw_mutex, tbb::queuing_rw_mutex>);
 }
 #endif // __TBB_CPP20_CONCEPTS_PRESENT
+
diff --git a/third-party/tbb/test/tbb/test_parallel_for_each.cpp b/third-party/tbb/test/tbb/test_parallel_for_each.cpp
index f6bb5090..3dfc107e 100644
--- a/third-party/tbb/test/tbb/test_parallel_for_each.cpp
+++ b/third-party/tbb/test/tbb/test_parallel_for_each.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2021 Intel Corporation
+    Copyright (c) 2005-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -22,6 +22,105 @@
 //! \file test_parallel_for_each.cpp
 //! \brief Test for [algorithms.parallel_for_each]
 
+#if __TBB_CPP20_PRESENT
+// Fancy iterator type that models a C++20-style iterator:
+// the real iterator category is defined through the iterator_concept type,
+// while iterator_category is always std::input_iterator_tag.
+// Similar iterators are used by C++20 ranges (e.g. std::ranges::iota_view::iterator).
+// The parallel_for_each algorithm should classify such iterators by their iterator_concept value.
+
+template <typename T, typename Category>
+struct cpp20_iterator { // Test iterator over a raw T*; Category selects which operations are available
+    static_assert(std::derived_from<Category, std::forward_iterator_tag>,
+                  "cpp20_iterator should be of at least forward iterator category");
+
+    using iterator_concept = Category; // the category this iterator is meant to model
+    using iterator_category = std::input_iterator_tag; // always reported as input, regardless of Category
+    using value_type = T;
+    using difference_type = std::ptrdiff_t;
+
+    cpp20_iterator() = default; // my_ptr is nullptr
+    explicit cpp20_iterator(T* ptr) : my_ptr(ptr) {}
+
+    T& operator*() const { return *my_ptr; }
+
+    cpp20_iterator& operator++() { // pre-increment: available for every Category
+        ++my_ptr;
+        return *this;
+    }
+
+    cpp20_iterator operator++(int) { // post-increment: returns the value before the step
+        auto it = *this;
+        ++*this;
+        return it;
+    }
+
+    cpp20_iterator& operator--() // decrement exists only when Category derives from bidirectional_iterator_tag
+        requires std::derived_from<Category, std::bidirectional_iterator_tag>
+    {
+        --my_ptr;
+        return *this;
+    }
+
+    cpp20_iterator operator--(int) // post-decrement: returns the value before the step
+        requires std::derived_from<Category, std::bidirectional_iterator_tag>
+    {
+        auto it = *this;
+        --*this;
+        return it;
+    }
+
+    cpp20_iterator& operator+=(difference_type n) // the operations below exist only for random access categories
+        requires std::derived_from<Category, std::random_access_iterator_tag>
+    {
+        my_ptr += n;
+        return *this;
+    }
+
+    cpp20_iterator& operator-=(difference_type n)
+        requires std::derived_from<Category, std::random_access_iterator_tag>
+    {
+        my_ptr -= n;
+        return *this;
+    }
+
+    T& operator[](difference_type n) const
+        requires std::derived_from<Category, std::random_access_iterator_tag>
+    {
+        return my_ptr[n];
+    }
+
+    friend bool operator==(const cpp20_iterator&, const cpp20_iterator&) = default; // defaulted: compares my_ptr
+
+    friend auto operator<=>(const cpp20_iterator&, const cpp20_iterator&) // ordering only for random access categories
+        requires std::derived_from<Category, std::random_access_iterator_tag> = default;
+
+    friend cpp20_iterator operator+(cpp20_iterator i, difference_type n)
+        requires std::derived_from<Category, std::random_access_iterator_tag>
+    {
+        return cpp20_iterator(i.my_ptr + n);
+    }
+
+    friend cpp20_iterator operator+(difference_type n, cpp20_iterator i) // commuted form, delegates to i + n
+        requires std::derived_from<Category, std::random_access_iterator_tag>
+    {
+        return i + n;
+    }
+
+    friend cpp20_iterator operator-(cpp20_iterator i, difference_type n)
+        requires std::derived_from<Category, std::random_access_iterator_tag>
+    {
+        return cpp20_iterator(i.my_ptr - n);
+    }
+
+    friend difference_type operator-(const cpp20_iterator& x, const cpp20_iterator& y) { // unconstrained: declared for every Category
+        return x.my_ptr - y.my_ptr;
+    }
+private:
+    T* my_ptr = nullptr;
+}; // class cpp20_iterator
+#endif // __TBB_CPP20_PRESENT
+
 //! Test forward access iterator support
 //! \brief \ref error_guessing \ref interface
 TEST_CASE("Forward iterator support") {
@@ -172,3 +271,65 @@ TEST_CASE("parallel_for_each constraints") {
 }
 
 #endif // __TBB_CPP20_CONCEPTS_PRESENT
+
+#if __TBB_CPP20_PRESENT
+
+struct no_copy_move { // Element type that can be default-constructed but neither copied nor moved
+    no_copy_move() = default;
+
+    no_copy_move(const no_copy_move&) = delete;
+    no_copy_move(no_copy_move&&) = delete;
+
+    no_copy_move& operator=(const no_copy_move&) = delete;
+    no_copy_move& operator=(no_copy_move&&) = delete;
+
+    int item = 0; // payload written by the parallel_for_each body in the test
+};
+
+template <typename Category> // Category is the iterator_concept tag given to cpp20_iterator
+void test_with_cpp20_iterator() {
+    constexpr std::size_t n = 1'000'000; // number of elements processed
+
+    std::vector<no_copy_move> elements(n); // value-initialized in place: item == 0 for every element
+
+    cpp20_iterator<no_copy_move, Category> begin(elements.data());
+    cpp20_iterator<no_copy_move, Category> end(elements.data() + n); // one past the last element
+
+    oneapi::tbb::parallel_for_each(begin, end, [](no_copy_move& element) { // body mutates each element through a reference
+        element.item = 42;
+    });
+
+    for (std::size_t index = 0; index < n; ++index) { // every element must have been visited
+        CHECK(elements[index].item == 42);
+    }
+}
+
+//! \brief \ref error_guessing \ref regression
+TEST_CASE("parallel_for_each with cpp20 iterator") {
+    // Test that parallel_for_each ignores the iterator_category type
+    // if the iterator_concept type is defined for the iterator
+
+    // For input iterators parallel_for_each requires the element type to be
+    // copyable or movable; since cpp20_iterator is at least a forward iterator,
+    // parallel_for_each should work with cpp20_iterator
+    // over a non-copyable and non-movable type
+
+    // test cpp20_iterator implementation
+    using cpp20_forward_iterator = cpp20_iterator<int, std::forward_iterator_tag>;
+    using cpp20_bidirectional_iterator = cpp20_iterator<int, std::bidirectional_iterator_tag>;
+    using cpp20_random_access_iterator = cpp20_iterator<int, std::random_access_iterator_tag>;
+
+    static_assert(std::forward_iterator<cpp20_forward_iterator>); // models exactly the requested concept...
+    static_assert(!std::bidirectional_iterator<cpp20_forward_iterator>); // ...and not the next stronger one
+
+    static_assert(std::bidirectional_iterator<cpp20_bidirectional_iterator>);
+    static_assert(!std::random_access_iterator<cpp20_bidirectional_iterator>);
+
+    static_assert(std::random_access_iterator<cpp20_random_access_iterator>);
+
+    test_with_cpp20_iterator<std::forward_iterator_tag>(); // run parallel_for_each once per category
+    test_with_cpp20_iterator<std::bidirectional_iterator_tag>();
+    test_with_cpp20_iterator<std::random_access_iterator_tag>();
+}
+
+#endif // __TBB_CPP20_PRESENT
diff --git a/third-party/tbb/test/tbb/test_resumable_tasks.cpp b/third-party/tbb/test/tbb/test_resumable_tasks.cpp
index a363a9ca..0cba9772 100644
--- a/third-party/tbb/test/tbb/test_resumable_tasks.cpp
+++ b/third-party/tbb/test/tbb/test_resumable_tasks.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2022 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -423,6 +423,7 @@ class TestCaseGuard {
 
 thread_local bool TestCaseGuard::m_local = false;
 
+#if !EMSCRIPTEN
 //! Nested test for suspend and resume
 //! \brief \ref error_guessing
 TEST_CASE("Nested test for suspend and resume") {
@@ -436,6 +437,7 @@ TEST_CASE("Nested arena") {
     TestCaseGuard guard;
     TestNestedArena();
 }
+#endif
 
 //! Test with external threads
 //! \brief \ref error_guessing
@@ -443,11 +445,13 @@ TEST_CASE("External threads") {
     TestNativeThread();
 }
 
+#if !EMSCRIPTEN
 //! Stress test with external threads
 //! \brief \ref stress
 TEST_CASE("Stress test with external threads") {
     TestCleanupMaster();
 }
+#endif
 
 //! Test with an arena observer
 //! \brief \ref error_guessing
diff --git a/third-party/tbb/test/tbb/test_scheduler_mix.cpp b/third-party/tbb/test/tbb/test_scheduler_mix.cpp
index c2c02bb7..8d8e0e37 100644
--- a/third-party/tbb/test/tbb/test_scheduler_mix.cpp
+++ b/third-party/tbb/test/tbb/test_scheduler_mix.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2021-2022 Intel Corporation
+    Copyright (c) 2021-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -522,7 +522,7 @@ enum ACTIONS {
     num_actions
 };
 
-void global_actor();
+void global_actor(size_t arenaAfterStealing);
 
 template <ACTIONS action>
 struct actor;
@@ -543,8 +543,13 @@ struct actor<arena_destroy> {
 
 template <>
 struct actor<arena_action> {
-    static void do_it(Random& r) {
+    static void do_it(Random& r, size_t arenaAfterStealing) {
         static thread_local std::size_t arenaLevel = 0;
+
+        // treat the arena index as a priority: we already own some resource,
+        // so we may only lay claim to lower-priority resources
+        arenaLevel = std::max(arenaLevel, arenaAfterStealing);
+
         ArenaTable::ScopedLock lock;
         auto entry = arenaTable.acquire(r, lock);
         if (entry.first) {
@@ -561,11 +566,13 @@ struct actor<arena_action> {
                     tbb::this_task_arena::enqueue([&wctx] { wctx.release(); });
                     tbb::detail::d1::wait(wctx, ctx);
                 } else {
-                    global_actor();
+                    global_actor(0);
                 }
             };
             switch (r.get() % (16*num_arena_actions)) {
             case arena_execute:
+                // to prevent deadlock, a potentially blocking operation
+                // may be called only for arenas with a larger index
                 if (entry.second > arenaLevel) {
                     gStats.notify(Statistics::ArenaExecute);
                     auto oldArenaLevel = arenaLevel;
@@ -579,7 +586,9 @@ struct actor<arena_action> {
                 utils_fallthrough;
             default:
                 gStats.notify(Statistics::ArenaEnqueue);
-                entry.first->enqueue([] { global_actor(); });
+                // after stealing by a worker, the task will run in arena
+                // with index entry.second
+                entry.first->enqueue([ entry ] { global_actor(entry.second); });
                 break;
             }
             arenaTable.release(lock);
@@ -601,7 +610,7 @@ struct actor<parallel_algorithm> {
         auto doGlbAction = rnd.get() % 1000 == 42;
         auto body = [doGlbAction, sz](int i) {
             if (i == sz / 2 && doGlbAction) {
-                global_actor();
+                global_actor(0);
             }
         };
 
@@ -621,7 +630,7 @@ struct actor<parallel_algorithm> {
     }
 };
 
-void global_actor() {
+void global_actor(size_t arenaAfterStealing) {
     static thread_local std::uint64_t localNumActions{};
 
     while (globalNumActions < maxNumActions) {
@@ -629,7 +638,7 @@ void global_actor() {
         switch (rnd.get() % num_actions) {
         case arena_create:  gStats.notify(Statistics::ArenaCreate); actor<arena_create>::do_it(rnd);  break;
         case arena_destroy: gStats.notify(Statistics::ArenaDestroy); actor<arena_destroy>::do_it(rnd); break;
-        case arena_action:  gStats.notify(Statistics::ArenaAcquire); actor<arena_action>::do_it(rnd);  break;
+        case arena_action:  gStats.notify(Statistics::ArenaAcquire); actor<arena_action>::do_it(rnd, arenaAfterStealing);  break;
         case parallel_algorithm: gStats.notify(Statistics::ParallelAlgorithm); actor<parallel_algorithm>::do_it(rnd);  break;
         }
 
@@ -656,7 +665,7 @@ TEST_CASE("Stress test with mixing functionality") {
     utils::SpinBarrier startBarrier{numExtraThreads};
     utils::NativeParallelFor(numExtraThreads, [&startBarrier](std::size_t) {
         startBarrier.wait();
-        global_actor();
+        global_actor(0);
     });
 
     arenaTable.shutdown();
diff --git a/third-party/tbb/test/tbb/test_task.cpp b/third-party/tbb/test/tbb/test_task.cpp
index dec24def..876e3510 100644
--- a/third-party/tbb/test/tbb/test_task.cpp
+++ b/third-party/tbb/test/tbb/test_task.cpp
@@ -771,7 +771,8 @@ TEST_CASE("Test with priority inversion") {
 
     auto high_priority_thread_func = [&] {
         // Increase external threads priority
-        utils::increase_thread_priority();
+        utils::increased_priority_guard guard{};
+        utils::suppress_unused_warning(guard);
         // pin external threads
         test_arena.execute([]{});
         while (task_counter++ < critical_task_counter) {
@@ -796,7 +797,8 @@ TEST_CASE("Test with priority inversion") {
         high_priority_threads.emplace_back(high_priority_thread_func);
     }
 
-    utils::increase_thread_priority();
+    utils::increased_priority_guard guard{};
+    utils::suppress_unused_warning(guard);
     while (task_counter++ < critical_task_counter) {
         submit(critical_task, test_arena, test_context, true);
         std::this_thread::sleep_for(std::chrono::milliseconds(1));
diff --git a/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp b/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp
index 224e2476..1a85ed58 100644
--- a/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp
+++ b/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2022 Intel Corporation
+    Copyright (c) 2005-2023 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -30,6 +30,10 @@
 
 #include "oneapi/tbb/detail/_config.h"
 
+// There is no RLIMIT_AS on OpenBSD.
+// Therefore, the memory-limit tests are not applicable there.
+#if !__OpenBSD__
+
 #define __TBB_NO_IMPLICIT_LINKAGE 1
 #include "tbb/scalable_allocator.h"
 
@@ -1091,3 +1095,4 @@ TEST_CASE("MAIN TEST") {
 }
 
 #endif /* __TBB_WIN8UI_SUPPORT	 */
+#endif /* Enable test */
diff --git a/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp b/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp
index 0f37e9f4..9de151e0 100644
--- a/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp
+++ b/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2024 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -1257,7 +1257,11 @@ void TestTHP() {
     scalable_allocation_mode(USE_HUGE_PAGES, 1);
     REQUIRE_MESSAGE(hugePages.isEnabled, "Huge pages should be enabled via scalable_allocation_mode");
 
+#if defined __loongarch64
+    const int HUGE_PAGE_SIZE = 32 * 1024 * 1024;
+#else
     const int HUGE_PAGE_SIZE = 2 * 1024 * 1024;
+#endif
 
     // allocCount transparent huge pages should be allocated
     const int allocCount = 10;
diff --git a/third-party/tbb/third-party-programs.txt b/third-party/tbb/third-party-programs.txt
index b555450a..c088429c 100644
--- a/third-party/tbb/third-party-programs.txt
+++ b/third-party/tbb/third-party-programs.txt
@@ -1,58 +1,55 @@
 oneAPI Threading Building Blocks (oneTBB) Third Party Programs File
 
-This file contains the list of third party software ("third party programs")
-contained in the Intel software and their required notices and/or license terms.
-This third party software, even if included with the distribution of the Intel
-software, may be governed by separate license terms, including without limitation,
-third party license terms, other Intel software license terms, and open source
-software license terms. These separate license terms govern your use of the third
-party programs as set forth in the "third-party-programs.txt" or other similarlynamed text file.
+This file is the "third-party-programs.txt" file specified  in  the  associated Intel end user license
+agreement for the Intel software you are licensing.
 
 The third party programs and their corresponding required notices and/or license
 terms are listed below.
 _______________________________________________________________________________________________________
 
-1.  Intel(R) Instrumentation and Tracing Technology (ITT)
-    Copyright (c) 2022 Intel Corporation. All rights reserved.
+1.  Instrumentation and Tracing Technology (ITT) Notify User API:
+    Copyright (c) 2005-2023 Intel Corporation. All rights reserved.
 
-    Redistribution and use in source and binary forms, with or without modification, 
-    are permitted provided that the following conditions are met:
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
 
     1. Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
     2. Redistributions in binary form must reproduce the above copyright
-       notice, this list of conditions and the following disclaimer in the
-       documentation and/or other materials provided with the distribution.
-    3. Neither the name of the copyright holder nor the names of its contributors may be
-       used to endorse or promote products derived from this software
-       without specific prior written permission.
-    
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-    OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-    OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-    SUCH DAMAGE.
+       notice, this list of conditions and the following disclaimer in 
+       the documentation and/or other materials provided with the 
+       distribution.
+    3. Neither the name of the copyright holder nor the names of its
+       contributors may be used to endorse or promote products derived
+       from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 _______________________________________________________________________________________________________
 
-2.  ActiveState Thread pool with same API as (multi)  processing.Pool (Python recipe):
-    Copyright (c) 2008,2016 david decotigny (this file)
-    Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool)
+2.  Portable Hardware Locality (hwloc):
 
-    Portable Hardware Locality (hwloc) 
-    Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana University Research and Technology Corporation.  All rights reserved.
-    Copyright (c) 2004-2005 The University of Tennessee and The University of Tennessee Research Foundation.  All rights reserved.
-    Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart.  All rights reserved.
+    Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana University Research and
+                            Technology Corporation.  All rights reserved.
+    Copyright (c) 2004-2005 The University of Tennessee and The University of Tennessee Research
+                            Foundation. All rights reserved.
+    Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart.
+                            All rights reserved.
     Copyright (c) 2004-2005 The Regents of the University of California. All rights reserved.
     Copyright (c) 2009      CNRS
     Copyright (c) 2009-2016 Inria.  All rights reserved.
-    Copyright (c) 2009-2015 Universit� Bordeaux
+    Copyright (c) 2009-2015 Université Bordeaux
     Copyright (c) 2009-2015 Cisco Systems, Inc.  All rights reserved.
     Copyright (c) 2009-2012 Oracle and/or its affiliates.  All rights reserved.
     Copyright (c) 2010      IBM
@@ -60,35 +57,32 @@ ________________________________________________________________________________
     Copyright (c) 2012      Aleksej Saushev, The NetBSD Foundation
     Copyright (c) 2012      Blue Brain Project, EPFL. All rights reserved.
     Copyright (c) 2013-2014 University of Wisconsin-La Crosse. All rights reserved.
-    Copyright (c) 2015      Research Organization for Information Science and Technology (RIST). All rights reserved.
+    Copyright (c) 2015      Research Organization for Information Science and Technology (RIST).
+                            All rights reserved.
     Copyright (c) 2015-2016 Intel, Inc.  All rights reserved.
-
-    BSD 3-clause "New" or "Revised" License
+    See COPYING in top-level directory.
 
     Redistribution and use in source and binary forms, with or without
     modification, are permitted provided that the following conditions
     are met:
-
     1. Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
     2. Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
-    3. Neither the name of author nor the names of any contributors may be
-       used to endorse or promote products derived from this software
-       without specific prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
-    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-    ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
-    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-    OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-    OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-    SUCH DAMAGE.
+    3. The name of the author may not be used to endorse or promote products
+       derived from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+    OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+    IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+    NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+    THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 _______________________________________________________________________________________________________
 
 3.  gperftools: Copyright (c) 2011, Google Inc.
@@ -126,268 +120,60 @@ ________________________________________________________________________________
 
 4.  Mateusz Kwiatkowski Workaround for bug 62258 in libstdc++
 
-    GPL 3.0 with GCC Runtime Library Exception 3.1
-
-	GNU GENERAL PUBLIC LICENSE
-
-	Version 3, 29 June 2007
-
-	Copyright (c) 2007 Free Software Foundation, Inc. <https://fsf.org/>
-
-	Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
-
-	Preamble
-	The GNU General Public License is a free, copyleft license for software and other kinds of works.
-
-	The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too.
-
-	When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things.
-
-	To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others.
-
-	For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights.
-
-	Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it.
-
-	For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions.
-
-	Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users.
-
-	Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free.
-
-	The precise terms and conditions for copying, distribution and modification follow.
-
-	TERMS AND CONDITIONS
-	0. Definitions.
-	"This License" refers to version 3 of the GNU General Public License.
-
-	"Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.
-
-	"The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations.
-
-	To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work.
-
-	A "covered work" means either the unmodified Program or a work based on the Program.
-
-	To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well.
-
-	To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying.
-
-	An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion.
-
-	1. Source Code.
-	The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work.
-
-	A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.
-
-	The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.
-
-	The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work.
-
-	The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source.
-
-	The Corresponding Source for a work in source code form is that same work.
-
-	2. Basic Permissions.
-	All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.
-
-	You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.
-
-	Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
-
-	3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-	No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.
-
-	When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures.
-
-	4. Conveying Verbatim Copies.
-	You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.
-
-	You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.
-
-	5. Conveying Modified Source Versions.
-	You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:
-
-	a) The work must carry prominent notices stating that you modified it, and giving a relevant date.
-	b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices".
-	c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it.
-	d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so.
-	A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.
-
-	6. Conveying Non-Source Forms.
-	You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:
-
-	a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange.
-	b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge.
-	c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b.
-	d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements.
-	e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d.
-	A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work.
-
-	A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product.
-
-	"Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made.
-
-	If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM).
-
-	The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network.
-
-	Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying.
-
-	7. Additional Terms.
-	"Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.
-
-	When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.
-
-	Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:
-
-	a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or
-	b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or
-	c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or
-	d) Limiting the use for publicity purposes of names of licensors or authors of the material; or
-	e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or
-	f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors.
-	All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying.
-
-	If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms.
-
-	Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way.
-
-	8. Termination.
-	You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).
-
-	However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.
-
-	Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.
-
-	Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.
-
-	9. Acceptance Not Required for Having Copies.
-	You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.
-
-	10. Automatic Licensing of Downstream Recipients.
-	Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License.
-
-	An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.
-
-	You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.
-
-	11. Patents.
-	A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version".
-
-	A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.
-
-	Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.
-
-	In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party.
-
-	If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid.
-
-	If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it.
-
-	A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007.
-
-	Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law.
-
-	12. No Surrender of Others' Freedom.
-	If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.
-
-	13. Use with the GNU Affero General Public License.
-	Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such.
-
-	14. Revised Versions of this License.
-	The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
-
-	Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation.
-
-	If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program.
-
-	Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.
-
-	15. Disclaimer of Warranty.
-	THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-	16. Limitation of Liability.
-	IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-
-	17. Interpretation of Sections 15 and 16.
-	If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.
-
-	END OF TERMS AND CONDITIONS
-
-	How to Apply These Terms to Your New Programs
-	If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.
-
-	To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found.
-
-		<one line to give the program's name and a brief idea of what it does.>
-		Copyright (C) <year>  <name of author>
-
-		This program is free software: you can redistribute it and/or modify
-		it under the terms of the GNU General Public License as published by
-		the Free Software Foundation, either version 3 of the License, or
-		(at your option) any later version.
-
-		This program is distributed in the hope that it will be useful,
-		but WITHOUT ANY WARRANTY; without even the implied warranty of
-		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-		GNU General Public License for more details.
-
-		You should have received a copy of the GNU General Public License
-		along with this program.  If not, see <https://www.gnu.org/licenses/>.
-	Also add information on how to contact you by electronic and paper mail.
-
-	If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode:
-
-		<program>  Copyright (C) <year>  <name of author>
-		This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-		This is free software, and you are welcome to redistribute it
-		under certain conditions; type `show c' for details.
-	The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box".
-
-	You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>.
-
-	The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>.
-
-
-	GCC RUNTIME LIBRARY EXCEPTION
-
-	Version 3.1, 31 March 2009
-
-	Copyright (c) 2009 Free Software Foundation, Inc. <https://fsf.org/>
-
-	Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
-
-	This GCC Runtime Library Exception ("Exception") is an additional permission under section 7 of the GNU General Public License, version 3 ("GPLv3"). It applies to a given file (the "Runtime Library") that bears a notice placed by the copyright holder of the file stating that the file is governed by GPLv3 along with this Exception.
-
-	When you use GCC to compile a program, GCC may combine portions of certain GCC header files and runtime libraries with the compiled program. The purpose of this Exception is to allow compilation of non-GPL (including proprietary) programs to use, in this way, the header files and runtime libraries covered by this Exception.
-
-	0. Definitions.
-	A file is an "Independent Module" if it either requires the Runtime Library for execution after a Compilation Process, or makes use of an interface provided by the Runtime Library, but is not otherwise based on the Runtime Library.
-
-	"GCC" means a version of the GNU Compiler Collection, with or without modifications, governed by version 3 (or a specified later version) of the GNU General Public License (GPL) with the option of using any subsequent versions published by the FSF.
-
-	"GPL-compatible Software" is software whose conditions of propagation, modification and use would permit combination with GCC in accord with the license of GCC.
-
-	"Target Code" refers to output from any compiler for a real or virtual target processor architecture, in executable form or suitable for input to an assembler, loader, linker and/or execution phase. Notwithstanding that, Target Code does not include data in any format that is used as a compiler intermediate representation, or used for producing a compiler intermediate representation.
-
-	The "Compilation Process" transforms code entirely represented in non-intermediate languages designed for human-written code, and/or in Java Virtual Machine byte code, into Target Code. Thus, for example, use of source code generators and preprocessors need not be considered part of the Compilation Process, since the Compilation Process can be understood as starting with the output of the generators or preprocessors.
-
-	A Compilation Process is "Eligible" if it is done using GCC, alone or with other GPL-compatible software, or if it is done without using any work based on GCC. For example, using non-GPL-compatible Software to optimize any GCC intermediate representations would not qualify as an Eligible Compilation Process.
-
-	1. Grant of Additional Permission.
-	You have permission to propagate a work of Target Code formed by combining the Runtime Library with Independent Modules, even if such propagation would otherwise violate the terms of GPLv3, provided that all Target Code was generated by Eligible Compilation Processes. You may then convey such a combination under terms of your choice, consistent with the licensing of the Independent Modules.
+    ********************************************************************************
+    * Author: Mateusz Kwiatkowski <m.kwiatkowski@avsystem.com>                     *
+    *                                                                              *
+    * I hereby renounce all copyright to this file and my rights resulting from    *
+    * it, to the broadest extent permitted by law. It may be treated as public     *
+    * domain.                                                                      *
+    *                                                                              *
+    * However, as this file interfaces with GCC internal ABI, it may be subject to *
+    * the terms and conditions of the GNU General Public License. Please consult   *
+    * the GCC licensing terms and/or a lawyer for details.                         *
+    *                                                                              *
+    * Note that libstdc++ licensing terms grant additional permissions described   *
+    * in the GCC Runtime Library Exception, version 3.1, as published by the       *
+    * Free Software Foundation.                                                    *
+    *******************************************************************************/
+_______________________________________________________________________________________________________
 
-	2. No Weakening of GCC Copyleft.
-	The availability of this Exception does not imply any general presumption that third-party software is unaffected by the copyleft requirements of the license of GCC.
+5. ActiveState Thread pool with same API as (multi)processing.Pool (Python recipe)
+
+    #
+    # Copyright (c) 2008,2016 david decotigny (this file)
+    # Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool)
+    # All rights reserved.
+    #
+    # Redistribution and use in source and binary forms, with or without
+    # modification, are permitted provided that the following conditions
+    # are met:
+    #
+    # 1. Redistributions of source code must retain the above copyright
+    #    notice, this list of conditions and the following disclaimer.
+    # 2. Redistributions in binary form must reproduce the above copyright
+    #    notice, this list of conditions and the following disclaimer in the
+    #    documentation and/or other materials provided with the distribution.
+    # 3. Neither the name of author nor the names of any contributors may be
+    #    used to endorse or promote products derived from this software
+    #    without specific prior written permission.
+    #
+    # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+    # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+    # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+    # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+    # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+    # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+    # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+    # SUCH DAMAGE.
 
 _______________________________________________________________________________________________________
 
-5.  Doctest
-
-    Copyright (c) 2016-2021 Viktor Kirilov
+6. doctest
 
-    The MIT License (MIT)
+    Copyright (c) 2016-2023 Viktor Kirilov
 
     Permission is hereby granted, free of charge, to any person obtaining a copy
     of this software and associated documentation files (the "Software"), to deal
@@ -406,6 +192,7 @@ ________________________________________________________________________________
     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     SOFTWARE.
+
 _______________________________________________________________________________________________________
 
-*Other names and brands may be claimed as the property of others.
+*Other names and brands may be claimed as the property of others.
\ No newline at end of file